diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/.appveyor.yml similarity index 54% rename from packages/kokkos/appveyor.yml rename to packages/kokkos/.appveyor.yml index c0b6e9cab9f73fc180796a4d154b758b71037a8f..23cac222ca38240d5529f84f885f0145f3dea550 100644 --- a/packages/kokkos/appveyor.yml +++ b/packages/kokkos/.appveyor.yml @@ -5,6 +5,6 @@ build_script: - cmd: >- mkdir build && cd build && - cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && + cmake c:\projects\source -DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON && cmake --build . --target install && ctest -C Debug --output-on-failure diff --git a/packages/kokkos/.clang-format b/packages/kokkos/.clang-format index db5f94fa2ebb6f3d343ff9ce86507229c2b990f9..090edc2c51f1e5fd49286cd3e911679afd4f6bea 100644 --- a/packages/kokkos/.clang-format +++ b/packages/kokkos/.clang-format @@ -1,4 +1,4 @@ -#Official Tool: clang-format version 8.0.0 +#Official Tool: clang-format version 16.0.0 BasedOnStyle: google SortIncludes: false AlignConsecutiveAssignments: true diff --git a/packages/kokkos/.clang-format-ignore b/packages/kokkos/.clang-format-ignore deleted file mode 100644 index 43d242c3106a29c063e0258bbd4e4553f66f883c..0000000000000000000000000000000000000000 --- a/packages/kokkos/.clang-format-ignore +++ /dev/null @@ -1,3 +0,0 @@ -core/unit_test/config/results/* -tpls/gtest/gtest/* -core/src/desul/* diff --git a/packages/kokkos/.clang-tidy b/packages/kokkos/.clang-tidy index 2b0d6e51d438948c2d5ef85e100c97ca16184e9b..f1aba1f52e5f3c73ac7dd7c9bc60c89544a244eb 100644 --- a/packages/kokkos/.clang-tidy +++ b/packages/kokkos/.clang-tidy @@ -1,3 +1,3 @@ -Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast' +Checks: 
'-*,kokkos-*,modernize-type-traits,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast' FormatStyle: file -HeaderFilterRegex: '.*/*.hpp' +HeaderFilterRegex: '(algorithms|benchmarks|containers|core|example|simd).*\.hpp' diff --git a/packages/kokkos/.cmake-format.py b/packages/kokkos/.cmake-format.py new file mode 100644 index 0000000000000000000000000000000000000000..6a66b6a14ec820668cff9d26bf91c685bc861ebe --- /dev/null +++ b/packages/kokkos/.cmake-format.py @@ -0,0 +1,28 @@ +# ----------------------------- +# Options affecting formatting. +# ----------------------------- +with section("format"): + + # How wide to allow formatted cmake files + line_width = 120 + + # If an argument group contains more than this many sub-groups (parg or kwarg + # groups) then force it to a vertical layout. + max_subgroups_hwrap = 3 + + # If a statement is wrapped to more than one line, than dangle the closing + # parenthesis on its own line. + dangle_parens = True + + # If the trailing parenthesis must be 'dangled' on its on line, then align it + # to this reference: `prefix`: the start of the statement, `prefix-indent`: + # the start of the statement, plus one indentation level, `child`: align to + # the column of the arguments + dangle_align = 'prefix' + +# ------------------------------------------------ +# Options affecting comment reflow and formatting. 
+# ------------------------------------------------ +with section("markup"): + # enable comment markup parsing and reflow + enable_markup = False diff --git a/packages/kokkos/.codecov.yml b/packages/kokkos/.codecov.yml deleted file mode 100644 index 097b0264a272ece51c38932b6f2486f75234f040..0000000000000000000000000000000000000000 --- a/packages/kokkos/.codecov.yml +++ /dev/null @@ -1,11 +0,0 @@ -coverage: - precision: 1 - round: down - range: "70...100" -ignore: - - tpls/ - - algorithms/unit_tests - - core/perf_test/ - - core/unit_test/ - - containers/performance_tests - - containers/unit_tests diff --git a/packages/kokkos/.git-blame-ignore-revs b/packages/kokkos/.git-blame-ignore-revs new file mode 100644 index 0000000000000000000000000000000000000000..2a108a1c2ea83a32d304101552a5311886127669 --- /dev/null +++ b/packages/kokkos/.git-blame-ignore-revs @@ -0,0 +1,6 @@ +# Formatted CMake files with cmake-format +0247634f35e2f9e6b9dec3c80cae567b15027554 +# Moved to clang-format-16 +60fb9cc94b40e698cbc3278c5538f58dee721276 +# Formatted the entire codebase with ClangFormat 8 +77d8965fa2b123e5172ac5ea361e731f7fc52fc8 diff --git a/packages/kokkos/.gitattributes b/packages/kokkos/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..2af9a724d4884a4183c574af7c9291fc8609f8d6 --- /dev/null +++ b/packages/kokkos/.gitattributes @@ -0,0 +1,8 @@ +.clang* export-ignore +.cmake* export-ignore +.git* export-ignore +.jenkins* export-ignore +.appveyor.yml export-ignore +.*gitlab-ci.yml export-ignore +scripts/ export-ignore +LICENSE_FILE_HEADER export-ignore diff --git a/packages/kokkos/.github/dependabot.yml b/packages/kokkos/.github/dependabot.yml new file mode 100644 index 0000000000000000000000000000000000000000..7242bd2851f4a8b11c007675fec335fe41a87db4 --- /dev/null +++ b/packages/kokkos/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: / + schedule: + interval: "weekly" diff --git 
a/packages/kokkos/.github/workflows/clang-format-check.yml b/packages/kokkos/.github/workflows/clang-format-check.yml new file mode 100644 index 0000000000000000000000000000000000000000..60d49d78bf61f79a878799ed568ce13478369261 --- /dev/null +++ b/packages/kokkos/.github/workflows/clang-format-check.yml @@ -0,0 +1,16 @@ +name: clang-format check + +on: + workflow_call: + +permissions: read-all + +jobs: + clang-formatting-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - name: Run clang-format style check. + uses: DoozyX/clang-format-lint-action@c71d0bf4e21876ebec3e5647491186f8797fde31 # v0.18.2 + with: + clangFormatVersion: 16 diff --git a/packages/kokkos/.github/workflows/cmake-format-check.yml b/packages/kokkos/.github/workflows/cmake-format-check.yml new file mode 100644 index 0000000000000000000000000000000000000000..af79e50a951232d7153cb51e65c2fadc4ad01a5a --- /dev/null +++ b/packages/kokkos/.github/workflows/cmake-format-check.yml @@ -0,0 +1,18 @@ +name: cmake-format check + +on: + workflow_call: + +permissions: read-all + +jobs: + cmake-formatting-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - name: cmake-format lint action + uses: puneetmatharu/cmake-format-lint-action@efbb497b2a8badd2c9dc638faaf8ef4a9aa71bc8 # v1.0.4 + with: + args: --config-files .cmake-format.py --in-place + - name: check + run: git diff --exit-code diff --git a/packages/kokkos/.github/workflows/codeql.yml b/packages/kokkos/.github/workflows/codeql.yml new file mode 100644 index 0000000000000000000000000000000000000000..ab85380548bdaf74e569ef28075ff7d50b9a61bd --- /dev/null +++ b/packages/kokkos/.github/workflows/codeql.yml @@ -0,0 +1,48 @@ +name: "CodeQL" + +on: + push: + branches: [ "master", "develop", "release-*" ] + workflow_call: + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + timeout-minutes: 360 + permissions: + # 
required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + with: + languages: c-cpp + + - name: configure + run: + cmake -B build . + -DKokkos_ENABLE_OPENMP=ON + -DCMAKE_CXX_STANDARD=17 + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF + -DKokkos_ENABLE_TESTS=ON + -DKokkos_ENABLE_EXAMPLES=ON + -DKokkos_ENABLE_BENCHMARKS=ON + -DCMAKE_BUILD_TYPE=Debug + - name: build + run: + cmake --build build --parallel 2 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + with: + category: "/language:c-cpp" diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow-32bit.yml b/packages/kokkos/.github/workflows/continuous-integration-linux-32bit.yml similarity index 78% rename from packages/kokkos/.github/workflows/continuous-integration-workflow-32bit.yml rename to packages/kokkos/.github/workflows/continuous-integration-linux-32bit.yml index 68fbdbe8a4764d1e787361f99f3430bb26bbf8f7..02e454a931a98d4d41780570e0d9e0ffc47f356d 100644 --- a/packages/kokkos/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/packages/kokkos/.github/workflows/continuous-integration-linux-32bit.yml @@ -1,16 +1,12 @@ name: github-Linux-32bit on: - push: - branches: - - develop - pull_request: - paths-ignore: - - '**/*.md' - types: [ opened, reopened, synchronize ] + workflow_call: + +permissions: read-all concurrency: - group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}-linux-x86 cancel-in-progress: ${{github.event_name == 'pull_request'}} jobs: @@ -21,7 
+17,7 @@ jobs: image: ghcr.io/kokkos/ci-containers/ubuntu:latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: install_multilib run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib gfortran-multilib - name: Configure Kokkos @@ -32,9 +28,8 @@ jobs: -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DCMAKE_CXX_FLAGS="-Werror -m32 -DKOKKOS_IMPL_32BIT" \ + -DCMAKE_CXX_FLAGS="-Werror -m32" \ -DCMAKE_CXX_COMPILER=g++ \ -DCMAKE_BUILD_TYPE=RelWithDebInfo - name: Build diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml b/packages/kokkos/.github/workflows/continuous-integration-linux-hpx.yml similarity index 88% rename from packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml rename to packages/kokkos/.github/workflows/continuous-integration-linux-hpx.yml index 8b39350dc8765f871d389eae56acc4b90a7af149..869af90f7fb08d0649580f94d6e328be077722d9 100644 --- a/packages/kokkos/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/packages/kokkos/.github/workflows/continuous-integration-linux-hpx.yml @@ -1,18 +1,14 @@ name: github-Linux-hpx on: - push: - branches: - - develop - pull_request: - paths-ignore: - - '**/*.md' - types: [ opened, reopened, synchronize ] + workflow_call: concurrency: - group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}-linux-x64-hpx cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: hpx: name: hpx @@ -20,7 +16,7 @@ jobs: steps: - name: checkout code - uses: actions/checkout@v3 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos - name: setup hpx dependencies @@ 
-33,12 +29,12 @@ jobs: libboost-all-dev \ ninja-build - name: checkout hpx - uses: actions/checkout@v3 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: STELLAR-GROUP/hpx - ref: 1.8.0 + ref: v1.9.0 path: hpx - - uses: actions/cache@v3 + - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 id: cache-hpx with: path: ./hpx/install diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-linux.yml similarity index 68% rename from packages/kokkos/.github/workflows/continuous-integration-workflow.yml rename to packages/kokkos/.github/workflows/continuous-integration-linux.yml index 8c226c3766c7c2563d82e6adaace3bcbcd3d1b9b..e8d10becdebc08eb1db23dce2c31496301cb5db0 100644 --- a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml +++ b/packages/kokkos/.github/workflows/continuous-integration-linux.yml @@ -1,68 +1,81 @@ name: github-Linux on: - push: - branches: - - develop - pull_request: - paths-ignore: - - '**/*.md' - types: [ opened, reopened, synchronize ] + workflow_call: concurrency: - group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}-linux-x64 cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: CI: continue-on-error: true strategy: matrix: - distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest'] + distro: ['fedora:latest', 'ubuntu:latest'] cxx: ['g++', 'clang++'] cxx_extra_flags: [''] cmake_build_type: ['Release', 'Debug'] backend: ['OPENMP'] clang-tidy: [''] + stdcxx: [17] include: - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' - cxx_extra_flags: '-diag-disable=177,10441' + cxx_extra_flags: '-diag-disable=177,1478,1786,10441' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + stdcxx: '17' + - distro: 'ubuntu:intel' cxx: 
'icpc' - cxx_extra_flags: '-diag-disable=177,10441' + cxx_extra_flags: '-diag-disable=177,1478,1786,10441' cmake_build_type: 'Debug' backend: 'OPENMP' - - distro: 'fedora:intel' + stdcxx: '17' + - distro: 'ubuntu:intel' cxx: 'icpx' - cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' + cxx_extra_flags: '-fp-model=precise -Wno-pass-failed -fsanitize=address,undefined -fno-sanitize=function -fno-sanitize=vptr -fno-sanitize-recover=all' + extra_linker_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize=vptr -fno-sanitize-recover=all' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + stdcxx: '17' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Debug' backend: 'OPENMP' + stdcxx: '20' - distro: 'ubuntu:latest' cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize=vptr -fno-sanitize-recover=all' + extra_linker_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize=vptr -fno-sanitize-recover=all' cmake_build_type: 'RelWithDebInfo' backend: 'THREADS' clang-tidy: '-DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*"' + stdcxx: '23' + - distro: 'ubuntu:latest' + cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize=vptr -fno-sanitize-recover=all' + extra_linker_flags: '-fsanitize=address,undefined -fno-sanitize=function -fno-sanitize=vptr -fno-sanitize-recover=all' + cmake_build_type: 'RelWithDebInfo' + backend: 'SERIAL' + stdcxx: '20' - distro: 'ubuntu:latest' cxx: 'g++' cmake_build_type: 'RelWithDebInfo' backend: 'THREADS' + stdcxx: '23' runs-on: ubuntu-latest container: image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} steps: - name: Checkout desul - uses: actions/checkout@v3 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: desul/desul - ref: 477da9c8f40f8db369c28dd3f93a67e376d8511b + ref: 
22931326247c9333cc909628004d75ee5de99fa2 path: desul - name: Install desul working-directory: desul @@ -74,8 +87,8 @@ jobs: cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install .. sudo cmake --build . --target install --parallel 2 - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }} @@ -94,6 +107,7 @@ jobs: cmake -B builddir \ -DCMAKE_INSTALL_PREFIX=/usr \ ${{ matrix.clang-tidy }} \ + -DBUILD_SHARED_LIBS=ON \ -Ddesul_ROOT=/usr/desul-install/ \ -DKokkos_ENABLE_DESUL_ATOMICS_EXTERNAL=ON \ -DKokkos_ENABLE_HWLOC=ON \ @@ -102,10 +116,10 @@ jobs: -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_IMPL_MDSPAN=ON \ -DCMAKE_CXX_FLAGS="-Werror ${{ matrix.cxx_extra_flags }}" \ + -DCMAKE_CXX_STANDARD="${{ matrix.stdcxx }}" \ + -DCMAKE_EXE_LINKER_FLAGS="${{ matrix.extra_linker_flags }}" \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} @@ -118,6 +132,7 @@ jobs: working-directory: builddir run: ctest --output-on-failure - name: Test linking against build dir + if: ${{ !contains(matrix.cxx_extra_flags, '-fsanitize=address') }} working-directory: example/build_cmake_installed run: | cmake -B builddir_buildtree -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DKokkos_ROOT=../../builddir @@ -128,6 +143,7 @@ jobs: - name: Install run: sudo cmake --build builddir --target install - name: Test install + if: ${{ !contains(matrix.cxx_extra_flags, '-fsanitize=address') }} working-directory: 
example/build_cmake_installed run: | cmake -B builddir -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} diff --git a/packages/kokkos/.github/workflows/osx.yml b/packages/kokkos/.github/workflows/continuous-integration-osx.yml similarity index 86% rename from packages/kokkos/.github/workflows/osx.yml rename to packages/kokkos/.github/workflows/continuous-integration-osx.yml index 85b079e56c8c2dc96a7ed7bc502086ffd2370e7a..103af7555941f434ba2a8fc2a5f98113dc334a6a 100644 --- a/packages/kokkos/.github/workflows/osx.yml +++ b/packages/kokkos/.github/workflows/continuous-integration-osx.yml @@ -1,18 +1,14 @@ name: github-OSX on: - push: - branches: - - develop - pull_request: - paths-ignore: - - '**/*.md' - types: [ opened, reopened, synchronize ] + workflow_call: concurrency: - group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}-osx-x64 cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: osxci: name: osx-ci @@ -31,7 +27,7 @@ jobs: cmake_build_type: "Release" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: configure run: cmake -B build . 
diff --git a/packages/kokkos/.github/workflows/continuous-integration-smoketest.yml b/packages/kokkos/.github/workflows/continuous-integration-smoketest.yml new file mode 100644 index 0000000000000000000000000000000000000000..75f03faa9922d265c2b9b8f9ce63ccead0b1af2d --- /dev/null +++ b/packages/kokkos/.github/workflows/continuous-integration-smoketest.yml @@ -0,0 +1,45 @@ +name: basic-test + +on: + workflow_call: + +concurrency: + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}-cibasic + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +permissions: read-all + +jobs: + gcc-smoketest: + name: gcc-smoketest + runs-on: [ubuntu-latest] + + steps: + - name: checkout code + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + path: kokkos + + - name: configure kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_FLAGS="-Werror" \ + -DKokkos_ENABLE_COMPILER_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_SERIAL=ON \ + -DKokkos_ENABLE_TESTS=ON \ + .. 
+ + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j4 install + + - name: test_kokkos + working-directory: kokkos/build + run: ctest --timeout 2000 -j2 --output-on-failure diff --git a/packages/kokkos/.github/workflows/continuous-integration-stager.yml b/packages/kokkos/.github/workflows/continuous-integration-stager.yml new file mode 100644 index 0000000000000000000000000000000000000000..9e89bc638161a34f593e5309213c19e9aac3ab94 --- /dev/null +++ b/packages/kokkos/.github/workflows/continuous-integration-stager.yml @@ -0,0 +1,65 @@ +name: staged-continuous-integration + +on: + push: + branches: + - develop + pull_request: + paths-ignore: + - '**/*.md' + types: [ opened, reopened, synchronize ] + +concurrency: + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +permissions: read-all + +jobs: + # pre-stage + clang-format-check: + uses: ./.github/workflows/clang-format-check.yml + + cmake-format-check: + uses: ./.github/workflows/cmake-format-check.yml + + initial-check: + uses: ./.github/workflows/continuous-integration-smoketest.yml + + # primary testing + codeql: + needs: [initial-check, clang-format-check, cmake-format-check] + # need to set manual permissions since it requires security-events write + # but then we have to set everything else manually to read since the + # default is overwritten + permissions: + # required for all workflows + security-events: write + # only required for workflows in private repositories + actions: read + contents: read + uses: ./.github/workflows/codeql.yml + + windows-cuda: + needs: [initial-check, clang-format-check, cmake-format-check] + uses: ./.github/workflows/continuous-integration-windows.yml + + linux-x64: + needs: [initial-check, clang-format-check, cmake-format-check] + uses: ./.github/workflows/continuous-integration-linux.yml + + linux-x64-hpx: + needs: [initial-check, clang-format-check, 
cmake-format-check] + uses: ./.github/workflows/continuous-integration-linux-hpx.yml + + linux-x86: + needs: [initial-check, clang-format-check, cmake-format-check] + uses: ./.github/workflows/continuous-integration-linux-32bit.yml + + osx-x64: + needs: [initial-check, clang-format-check, cmake-format-check] + uses: ./.github/workflows/continuous-integration-osx.yml + + performance-test: + needs: [initial-check, clang-format-check, cmake-format-check] + uses: ./.github/workflows/performance-benchmark.yml diff --git a/packages/kokkos/.github/workflows/continuous-integration-windows.yml b/packages/kokkos/.github/workflows/continuous-integration-windows.yml new file mode 100644 index 0000000000000000000000000000000000000000..3d535bac5502b0538c74b7f16b8ff5f5af7e7195 --- /dev/null +++ b/packages/kokkos/.github/workflows/continuous-integration-windows.yml @@ -0,0 +1,34 @@ +name: github-windows + +on: + workflow_call: + +concurrency: + group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }}-win-x64 + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +permissions: read-all + +jobs: + windows-cuda: + # Cuda build on Windows + name: Windows Cuda + runs-on: windows-2022 + + steps: + - uses: Jimver/cuda-toolkit@28e983fc6bf47e7a732934aa029a6738e2ce16e4 # v0.2.18 + id: cuda-toolkit + with: + cuda: '12.4.1' + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - name: configure + shell: bash + run: | + mkdir build + mkdir c:/project + cd build + cmake -DKokkos_ENABLE_CUDA=ON -DKokkos_ARCH_VOLTA70=ON -DKokkos_ENABLE_TESTS=ON -DKokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE=ON .. 
+ - name: build library + shell: bash + run: | + cmake --build build --parallel 2 --config Release diff --git a/packages/kokkos/.github/workflows/performance-benchmark.yml b/packages/kokkos/.github/workflows/performance-benchmark.yml index 59eed4f6096fb81cc37c89a3fab8b7dee391c14e..c4ddf356ae98660521b1d9d428bdd5f79c1e3ceb 100644 --- a/packages/kokkos/.github/workflows/performance-benchmark.yml +++ b/packages/kokkos/.github/workflows/performance-benchmark.yml @@ -3,10 +3,9 @@ on: push: branches: - develop - pull_request: - paths-ignore: - - '**/*.md' - types: [ opened, reopened, synchronize ] + workflow_call: + +permissions: read-all jobs: CI: @@ -23,8 +22,8 @@ jobs: BUILD_ID: ${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }} steps: - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }}-${{ github.sha }} @@ -53,7 +52,7 @@ jobs: find builddir/core/perf_test/ -name "*.json" -exec mv {} ${{ env.BUILD_ID }}/ \; - name: Push benchmark results if: ${{ github.ref == 'refs/heads/develop' }} - uses: dmnemec/copy_file_to_another_repo_action@main + uses: dmnemec/copy_file_to_another_repo_action@bbebd3da22e4a37d04dca5f782edd5201cb97083 # main env: API_TOKEN_GITHUB: ${{ secrets.DALG24_PUSH_BENCHMARK_RESULTS }} with: diff --git a/packages/kokkos/.github/workflows/releases.yml b/packages/kokkos/.github/workflows/releases.yml new file mode 100644 index 0000000000000000000000000000000000000000..ba5fcdc5387ed31d82c21d5ee014f93322917b70 --- /dev/null +++ b/packages/kokkos/.github/workflows/releases.yml @@ -0,0 +1,71 @@ +on: + push: + tags: '[0-9]+.[0-9]+.[0-9][0-9]' + + +permissions: read-all + +jobs: + # This step builds our artifacts, uploads them to the workflow run, and + # outputs 
their digest. + build: + outputs: + hashes: ${{ steps.hash.outputs.hashes }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - name: Build artifacts + run: | + git archive --prefix=kokkos-${{ github.ref_name }}/ -o kokkos-${{ github.ref_name }}.zip HEAD + git archive --prefix=kokkos-${{ github.ref_name }}/ -o kokkos-${{ github.ref_name }}.tar.gz HEAD + + - name: Generate hashes + shell: bash + id: hash + run: | + # sha256sum generates sha256 hash for all artifacts. + # base64 -w0 encodes to base64 and outputs on a single line. + sha256sum kokkos-${{ github.ref_name }}.zip kokkos-${{ github.ref_name }}.tar.gz > kokkos-${{ github.ref_name }}-SHA-256.txt + echo "hashes=$(base64 -w0 kokkos-${{ github.ref_name }}-SHA-256.txt)" >> "$GITHUB_OUTPUT" + + - name: Upload artifacts + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: release-artifacts + path: kokkos-${{ github.ref_name }}* + if-no-files-found: error + retention-days: 5 + + # This step calls the generic workflow to generate provenance. + provenance: + needs: [build] + permissions: + actions: read + id-token: write + contents: write + uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0 + with: + base64-subjects: "${{ needs.build.outputs.hashes }}" + # Upload provenance to a new release + upload-assets: true + provenance-name: "kokkos-${{ github.ref_name }}.intoto.jsonl" + + # This step uploads our artifacts to the tagged GitHub release. 
+ release: + needs: [build, provenance] + permissions: + contents: write + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/') + steps: + - name: Download artifacts + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: release-artifacts + - name: Upload assets + uses: softprops/action-gh-release@c062e08bd532815e2082a85e87e3ef29c3e6d191 # v2.0.8 + with: + files: | + kokkos-${{ github.ref_name }}.zip + kokkos-${{ github.ref_name }}.tar.gz + kokkos-${{ github.ref_name }}-SHA-256.txt diff --git a/packages/kokkos/.github/workflows/scorecard.yml b/packages/kokkos/.github/workflows/scorecard.yml new file mode 100644 index 0000000000000000000000000000000000000000..b077642298c89a7bac4e38275d55c28191de3e67 --- /dev/null +++ b/packages/kokkos/.github/workflows/scorecard.yml @@ -0,0 +1,73 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + # Weekly on Saturdays. + - cron: '30 1 * * 6' + push: + branches: [ master, develop ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. 
+ # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard. 
+ - name: "Upload SARIF results to code scanning" + uses: github/codeql-action/upload-sarif@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 + with: + sarif_file: results.sarif diff --git a/packages/kokkos/.github/workflows/snl-ci.yml b/packages/kokkos/.github/workflows/snl-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..946589250fc335f2fbfe8b9fd1c6dfa0f0ecb92e --- /dev/null +++ b/packages/kokkos/.github/workflows/snl-ci.yml @@ -0,0 +1,55 @@ +name: SNL-CI + +on: + pull_request: + paths-ignore: + - '**/*.md' + types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + CUDA_12_2_CPP20: + name: SNL_CUDA_NVCC_12_2_CPP20 + runs-on: [snl-kk-env-cuda-12.2.0-gcc-11.3.0-latest] + + steps: + - name: checkout_kokkos + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + nvidia-smi + cd kokkos + cmake -B build \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ARCH_HOPPER90=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + ./ + + - name: build_and_install_kokkos + working-directory: kokkos + run: | + cmake --build build -j36 + cmake --install build --prefix install + + - name: test_kokkos + working-directory: kokkos/build + run: ctest --output-on-failure --timeout 3600 + diff --git a/packages/kokkos/.github/workflows/weekly-cea.yml b/packages/kokkos/.github/workflows/weekly-cea.yml new file mode 100644 index 0000000000000000000000000000000000000000..d04c54b212f440c6adf341a3542636c9aabcff12 --- /dev/null +++ b/packages/kokkos/.github/workflows/weekly-cea.yml @@ 
-0,0 +1,53 @@ +name: Weekly CEA builds + +on: + schedule: + - cron: "0 2 * * 6" # every Saturday at 2am UTC + workflow_dispatch: + +jobs: + build_and_test: + env: + build_jobs: 40 + + strategy: + matrix: + build_type: + - Release + - Debug + backend: + - name: cuda-a100 + flags: -DKokkos_ENABLE_CUDA=ON -DKokkos_ARCH_AMPERE80=ON + gpu: a100 + modules: gcc/11.2.0/gcc-4.8.5 cuda/12.2.1/gcc-11.2.0 cmake/3.28.3/gcc-11.2.0 + + runs-on: [self-hosted, cuda] + + steps: + - uses: actions/checkout@v4 + + - name: Configure + run: | + run \ + -m "${{ matrix.backend.modules }}" \ + cmake -B build \ + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ + ${{ matrix.backend.flags }} + + - name: Build + run: | + run \ + -m "${{ matrix.backend.modules }}" \ + cmake --build build --parallel $build_jobs + + - name: Test + run: | + run \ + -g ${{ matrix.backend.gpu }} \ + -m "${{ matrix.backend.modules }}" \ + ctest --test-dir build --output-on-failure diff --git a/packages/kokkos/.gitlab/hpsf-gitlab-ci.yml b/packages/kokkos/.gitlab/hpsf-gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..ac1b7bb88639e22bc0df90d1b8aebf5d8d4d7f7d --- /dev/null +++ b/packages/kokkos/.gitlab/hpsf-gitlab-ci.yml @@ -0,0 +1,41 @@ +NVIDIA-GH200: + stage: test + tags: [nvidia-gh200] + image: masterleinad/kokkos-nvcc:12.6.1 + script: + - cmake + -B build + -DCMAKE_CXX_COMPILER=`pwd`/bin/nvcc_wrapper + -DCMAKE_CXX_FLAGS="-Werror=all-warnings -Werror" + -DKokkos_ARCH_HOPPER90=ON + -DKokkos_ENABLE_CUDA=ON + -DKokkos_ENABLE_COMPILER_WARNINGS=ON + -DKokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=ON + -DKokkos_ENABLE_TESTS=ON + - cmake --build build -j48 + - cd build + - ctest -V --output-junit result_gh200.xml + artifacts: + when: always + paths: + - build/result_gh200.xml + reports: + junit: build/result_gh200.xml + 
+INTEL-DATA-CENTER-MAX-1100: + stage: test + tags: [intel-data-center-max-1100] + image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 + script: + - sycl-ls + - export ONEAPI_DEVICE_SELECTOR=level_zero:gpu + - cmake -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=icpx -DKokkos_ENABLE_SYCL=ON -DKokkos_ARCH_INTEL_PVC=ON -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -fp-model=precise" + - cmake --build build -j48 + - cd build + - ctest -V --output-junit result_pvc1100.xml + artifacts: + when: always + paths: + - build/result_pvc1100.xml + reports: + junit: build/result_pvc1100.xml diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo index 0b83a99d68b0beaff367f2948bc7f17627237051..b704bf26e7ee08cc458924f222152a435084b0b4 100644 --- a/packages/kokkos/.gitrepo +++ b/packages/kokkos/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = git@github.com:kokkos/kokkos.git branch = master - commit = 71a9bcae52543bd065522bf3e41b5bfa467d8015 - parent = 04fc22a5b0c95070fe40c61fe2db5c6bdac384d5 + commit = 175257a51ff29a0059ec48bcd233ee096b2c0438 + parent = d10e19d8277c11520d02722976b353b8ccbcdee8 method = merge - cmdver = 0.4.6 + cmdver = 0.4.9 diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins index 6f5cf80033fa63b96c432a7f7d7ef2b8c582963c..5790ccb93ce117d5c90c87bf1b2287429194fecd 100644 --- a/packages/kokkos/.jenkins +++ b/packages/kokkos/.jenkins @@ -8,16 +8,21 @@ pipeline { } options { + disableConcurrentBuilds(abortPrevious: true) timeout(time: 6, unit: 'HOURS') } + triggers { + issueCommentTrigger('.*test this please.*') + } + stages { stage('Clang-Format') { agent { dockerfile { filename 'Dockerfile.clang' dir 'scripts/docker' - label 'nvidia-docker || rocm-docker || docker' + label 'nvidia-docker || docker' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } @@ -25,7 +30,157 @@ pipeline { sh './scripts/docker/check_format_cpp.sh' } } - stage('Build') { + stage('Build-1') { + parallel { + stage('GCC-8.4.0') { + agent { 
+ dockerfile { + filename 'Dockerfile.gcc' + dir 'scripts/docker' + label 'docker' + } + } + environment { + OMP_NUM_THREADS = 8 + OMP_NESTED = 'true' + OMP_MAX_ACTIVE_LEVELS = 3 + OMP_PROC_BIND = 'true' + } + steps { + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_LIBDL=OFF \ + -DKokkos_ENABLE_LIBQUADMATH=ON \ + -DKokkos_ENABLE_SERIAL=ON \ + .. && \ + make -j8 && ctest --no-compress-output -T Test --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c''' + } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + stage('HIP-ROCm-5.6-C++20') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:5.6-complete@sha256:578a310fb1037d9c5e23fded2564f239acf6dc7231ff4742d2e7279fe7cc5c4a' + label 'rocm-docker' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_HIP=ON \ + .. 
&& \ + make -j8 && ctest --no-compress-output -T Test --verbose''' + } + post { + always { + sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + stage('CUDA-11.0-NVCC-RDC') { + agent { + dockerfile { + filename 'Dockerfile.nvcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.0.3-devel-ubuntu20.04@sha256:10ab0f09fcdc796b4a2325ef1bce8f766f4a3500eab5a83780f80475ae26c7a6 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' + label 'nvidia-docker && (volta || ampere)' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + environment { + OMP_NUM_THREADS = 8 + // Nested OpenMP does not work for this configuration, + // so disabling it + OMP_MAX_ACTIVE_LEVELS = 1 + OMP_PLACES = 'threads' + OMP_PROC_BIND = 'spread' + NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8' + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf install && mkdir -p install && \ + rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=g++-8 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_OPENMP=OFF \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ + -DKokkos_ENABLE_CUDA_UVM=ON \ + -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + \ + -DCMAKE_INSTALL_PREFIX=${PWD}/../install \ + .. && \ + make -j8 install && \ + cd .. 
&& \ + rm -rf build-tests && mkdir -p build-tests && cd build-tests && \ + export CMAKE_PREFIX_PATH=${PWD}/../install && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ + -DCMAKE_CXX_FLAGS="-Werror --Werror=all-warnings -Xcudafe --diag_suppress=940" \ + -DCMAKE_EXE_LINKER_FLAGS="-Xnvlink -suppress-stack-size-warning" \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_INSTALL_TESTING=ON \ + .. && \ + make -j8 && ctest --no-compress-output -T Test --verbose && \ + cd ../example/build_cmake_installed && \ + rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=g++-8 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + .. && \ + make -j8 && ctest --verbose && \ + cd ../.. && \ + cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \ + cmake --build build_cmake_installed_different_compiler/build --target all && \ + cmake --build build_cmake_installed_different_compiler/build --target test''' + } + post { + always { + sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build-tests/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + } + } + stage('Build-2') { parallel { stage('OPENACC-NVHPC-CUDA-12.2') { agent { @@ -44,16 +199,23 @@ pipeline { /opt/cmake/bin/cmake \ -DCMAKE_CXX_COMPILER=nvc++ \ -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_OPENACC=ON \ -DKokkos_ARCH_VOLTA70=ON \ .. 
&& \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' + } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } } + } - stage('CUDA-12.2-NVHPC') { + stage('CUDA-12.2-NVHPC-AS-HOST-COMPILER') { agent { dockerfile { filename 'Dockerfile.nvhpc' @@ -77,19 +239,22 @@ pipeline { -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_CXX_COMPILER=nvc++ \ -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS="--diag_suppress=implicit_return_from_non_void_function,no_device_stack" \ + -DCMAKE_CXX_FLAGS="-Werror --diag_suppress=implicit_return_from_non_void_function" \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_OPENMP=ON \ - -DKokkos_ENABLE_IMPL_MDSPAN=ON \ - -DKokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON \ .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' + } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } } + } stage('SYCL-OneAPI') { agent { @@ -102,31 +267,32 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh '''. 
/opt/intel/oneapi/setvars.sh --include-intel-llvm && \ - rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ + -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \ + -DCMAKE_PREFIX_PATH="$ONE_DPL_DIR" \ -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ARCH_AMPERE80=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_SYCL=ON \ + -DKokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=ON \ -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ -DCMAKE_CXX_STANDARD=17 \ .. 
&& \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -135,8 +301,8 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete@sha256:4030c8af0c06c286174758523dabe4b3850bf72d4a8c1ef275d3ec69aa475f65' + label 'rocm-docker ' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -162,47 +328,15 @@ pipeline { -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_HIP=ON \ -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ -DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \ .. 
&& \ - make -j8 && ctest --verbose''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('HIP-ROCm-5.6-C++20') { - agent { - dockerfile { - filename 'Dockerfile.hipcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' - label 'rocm-docker && vega' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' - } - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ - -DCMAKE_CXX_STANDARD=20 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_HIP=ON \ - .. && \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -243,7 +377,7 @@ pipeline { -DKokkos_ARCH_AMD_GFX906=ON \ && \ cmake --build build --parallel ${BUILD_JOBS} && \ - cd build && ctest --output-on-failure + cd build && ctest --no-compress-output -T Test --output-on-failure ''' } post { @@ -272,7 +406,6 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_TUNING=ON \ @@ -280,11 +413,12 @@ pipeline { -DKokkos_ARCH_VOLTA70=ON \ -DCMAKE_CXX_STANDARD=17 \ .. 
&& \ - make -j8 && ctest --verbose''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -310,121 +444,18 @@ pipeline { -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_TUNING=ON \ -DKokkos_ARCH_VOLTA70=ON \ .. && \ - make -j8 && ctest --verbose''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('CUDA-11.7-NVCC') { - agent { - dockerfile { - filename 'Dockerfile.nvcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.7.1-devel-ubuntu20.04' - label 'nvidia-docker && volta' - args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' - } - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf build && mkdir -p build && cd build && \ - ../gnu_generate_makefile.bash \ - --with-options=compiler_warnings \ - --cxxflags="-Werror" \ - --cxxstandard=c++17 \ - --with-cuda \ - --with-cuda-options=enable_lambda \ - --arch=Volta70 \ - && \ - make test -j8''' - } - post { - always { - sh 'ccache --show-stats' - } - } - } - stage('CUDA-11.0-NVCC-RDC') { - agent { - dockerfile { - filename 'Dockerfile.nvcc' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0.3-devel-ubuntu18.04 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' - label 'nvidia-docker' - args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' - } - } - environment { - OMP_NUM_THREADS = 8 - // Nested OpenMP does not work for this configuration, - // so disabling it - 
OMP_MAX_ACTIVE_LEVELS = 1 - OMP_PLACES = 'threads' - OMP_PROC_BIND = 'spread' - NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8' - } - steps { - sh 'ccache --zero-stats' - sh '''rm -rf install && mkdir -p install && \ - rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=g++-8 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DCMAKE_CXX_STANDARD=17 \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_OPENMP=OFF \ - -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ - -DKokkos_ENABLE_CUDA_UVM=ON \ - -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_IMPL_MDSPAN=ON \ - -DCMAKE_INSTALL_PREFIX=${PWD}/../install \ - .. && \ - make -j8 install && \ - cd .. && \ - rm -rf build-tests && mkdir -p build-tests && cd build-tests && \ - export CMAKE_PREFIX_PATH=${PWD}/../install && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ - -DCMAKE_CXX_FLAGS=-Werror --Werror=all-warnings -Xcudafe --diag_suppress=3159 \ - -DCMAKE_CXX_STANDARD=17 \ - -DKokkos_INSTALL_TESTING=ON \ - .. && \ - make -j8 && ctest --verbose && \ - cd ../example/build_cmake_installed && \ - rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_CXX_COMPILER=g++-8 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DCMAKE_CXX_STANDARD=17 \ - .. && \ - make -j8 && ctest --verbose && \ - cd ../.. 
&& \ - cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \ - cmake --build build_cmake_installed_different_compiler/build --target all && \ - cmake --build build_cmake_installed_different_compiler/build --target test''' + make -j8 && ctest --no-compress-output -T Test --verbose''' } post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } @@ -433,8 +464,8 @@ pipeline { dockerfile { filename 'Dockerfile.nvcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.2-devel-ubuntu20.04' - label 'nvidia-docker' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.6.2-devel-ubuntu20.04@sha256:d95d54bc231f8aea7fda79f60da620324584b20ed31a8ebdb0686cffd34dd405' + label 'nvidia-docker && (volta || ampere)' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } } @@ -442,10 +473,11 @@ pipeline { sh 'ccache --zero-stats' sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ + -DBUILD_SHARED_LIBS=ON \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ - -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_FLAGS="-Werror -Werror=all-warnings" \ -DCMAKE_CXX_STANDARD=17 \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ @@ -455,12 +487,12 @@ pipeline { -DKokkos_ENABLE_TESTS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_CUDA=ON \ - -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_LIBDL=OFF \ - -DKokkos_ENABLE_IMPL_MDSPAN=ON \ - -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ + -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=ON \ .. 
&& \ - make -j8 && ctest --verbose && \ + make -j8 && ctest --no-compress-output -T Test --verbose && \ cd ../example/build_cmake_in_tree && \ rm -rf build && mkdir -p build && cd build && \ cmake -DCMAKE_CXX_STANDARD=17 .. && make -j8 && ctest --verbose''' @@ -468,42 +500,37 @@ pipeline { post { always { sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) } } } - stage('GCC-8.4.0') { + stage('CUDA-11.7-NVCC') { agent { dockerfile { - filename 'Dockerfile.gcc' + filename 'Dockerfile.nvcc' dir 'scripts/docker' - label 'docker' + additionalBuildArgs '--build-arg BASE=nvcr.io/nvidia/cuda:11.7.1-devel-ubuntu20.04@sha256:fc997521e612899a01dce92820f5f5a201dd943ebfdc3e49ba0706d491a39d2d' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } } - environment { - OMP_NUM_THREADS = 8 - OMP_NESTED = 'true' - OMP_MAX_ACTIVE_LEVELS = 3 - OMP_PROC_BIND = 'true' - } steps { + sh 'ccache --zero-stats' sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_STANDARD=17 \ - -DCMAKE_CXX_FLAGS=-Werror \ - -DKokkos_ARCH_NATIVE=ON \ - -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_BENCHMARKS=ON \ - -DKokkos_ENABLE_OPENMP=ON \ - -DKokkos_ENABLE_LIBDL=OFF \ - -DKokkos_ENABLE_LIBQUADMATH=ON \ - -DKokkos_ENABLE_SERIAL=ON \ - .. 
&& \ - make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c''' + ../gnu_generate_makefile.bash \ + --with-options=compiler_warnings \ + --cxxflags="-Werror -Werror=all-warnings" \ + --cxxstandard=c++17 \ + --with-cuda \ + --with-cuda-options=enable_lambda \ + --arch=Volta70 \ + && \ + make test -j8''' + } + post { + always { + sh 'ccache --show-stats' + } } } } diff --git a/packages/kokkos/.jenkins_nightly b/packages/kokkos/.jenkins_nightly index 5d5858178913b82827a5a3e90dd16e29e3cf4f2a..15bef607258fb38a74352bd74d347d9349c83049 100644 --- a/packages/kokkos/.jenkins_nightly +++ b/packages/kokkos/.jenkins_nightly @@ -70,37 +70,77 @@ pipeline { ''' } } - stage('GCC-13') { + stage('GCC-14') { agent { docker { - image 'gcc:13.1' + image 'gcc:14.1' label 'docker' } } steps { sh ''' - DEBIAN_FRONTEND=noninteractive && \ - apt-get update && apt-get upgrade -y && apt-get install -y \ - cmake \ - && \ - apt-get clean && rm -rf /var/lib/apt/lists/* + wget https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.sh && \ + chmod +x cmake-3.30.0-linux-x86_64.sh && ./cmake-3.30.0-linux-x86_64.sh --skip-license --prefix=/usr mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_STANDARD=23 \ + -DCMAKE_CXX_STANDARD=26 \ -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_BENCHMARKS=ON \ -DKokkos_ENABLE_EXAMPLES=ON \ -DKokkos_ENABLE_TESTS=ON \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_SERIAL=ON \ .. 
&& \ - make -j8 && ctest --verbose + make -j8 && ctest --no-compress-output -T Test --verbose ''' } + post { + always { + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } + } + stage('HIP-ROCM-6.2') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-22.04:6.2-complete' + label 'rocm-docker && AMD_Radeon_Instinct_MI210' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + environment { + // FIXME Test returns a wrong value + GTEST_FILTER = '-hip_hostpinned.view_allocation_large_rank' + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=20 \ + -DKokkos_ARCH_NATIVE=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_BENCHMARKS=ON \ + -DKokkos_ENABLE_HIP=ON \ + .. 
&& \ + make -j8 && ctest --no-compress-output -T Test --verbose''' + } + post { + always { + sh 'ccache --show-stats' + xunit([CTest(deleteOutputFiles: true, failIfNotNew: true, pattern: 'build/Testing/**/Test.xml', skipNoTestFiles: false, stopProcessingIfError: true)]) + } + } } } } diff --git a/packages/kokkos/.olcf-gitlab-ci.yml b/packages/kokkos/.olcf-gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..4e737cc536ec763279a0beb48e563dac297ae853 --- /dev/null +++ b/packages/kokkos/.olcf-gitlab-ci.yml @@ -0,0 +1,12 @@ +test: + stage: test + tags: [frontier, shell] + id_tokens: + OLCF_ID_TOKEN: + aud: https://code.olcf.ornl.gov + script: + - module load rocm/6.0 + - cmake -B build -DCMAKE_CXX_COMPILER=hipcc -DKokkos_ENABLE_HIP=ON -DKokkos_ENABLE_TESTS=ON + - cmake --build build -j48 + - cd build + - ctest -E Kokkos_CoreUnitTest_DeviceAndThreads -V diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md index c6115f4b3d27d187196cd341d74ed7a7d1ad4e5c..84bbd03585bd8ee74ae6dd3b75a1dd6966936580 100644 --- a/packages/kokkos/CHANGELOG.md +++ b/packages/kokkos/CHANGELOG.md @@ -1,5 +1,336 @@ # CHANGELOG +## 4.5.01 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.5.00...4.5.01) + +### Bug Fixes + +* Fix re-builds after cleaning the binary tree when doing `add_subdirectory` on the Kokkos source [\#7557](https://github.com/kokkos/kokkos/pull/7557) +* Update mdspan to include fix for submdspan and bracket operator with clang 15&16 [\#7559](https://github.com/kokkos/kokkos/pull/7559) +* Fix DynRankView performance regression by re-introducing shortcut operator() impls [\#7606](https://github.com/kokkos/kokkos/pull/7606) +* Add missing MI300A (`GFX942_APU`) option to Makefile build-system + +## 4.5.00 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.01...4.5.00) + +### Features + +* SYCL backend graduated to production ready +* Introduce new `SequentialHostInit` view allocation property 
[\#7229](https://github.com/kokkos/kokkos/pull/7229) (backported in 4.4.01) +* Support building with Run-Time Type Information (RTTI) disabled +* Add new `KOKKOS_RELOCATABLE_FUNCTION` function annotation macro [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +### Backend and Architecture Enhancements + +#### CUDA + +* Adding occupancy tuning for CUDA architectures [\#6788](https://github.com/kokkos/kokkos/pull/6788) +* By default disable `cudaMallocAsync` (i.e., revert the change made in version 4.2) [\#7353](https://github.com/kokkos/kokkos/pull/7353) + +#### HIP + +* Add support for AMD Phoenix APUs with Radeon 740M/760M/780M/880M/890M [\#7162](https://github.com/kokkos/kokkos/pull/7162) +* Update maximum waves per CU values for consumer card [\#7347](https://github.com/kokkos/kokkos/pull/7347) +* Check that Kokkos is running on the architecture it was compiled for [\#7379](https://github.com/kokkos/kokkos/pull/7379) +* Add opt-in option to use `hipMallocAsync` instead of `hipMalloc` [\#7324](https://github.com/kokkos/kokkos/pull/7324) +* Introduce new architecture option `AMD_GFX942_APU` for MI300A [\#7462](https://github.com/kokkos/kokkos/pull/7462) + +#### SYCL + +* Move the `SYCL` backend out of the `Experimental` namespace [\#7171](https://github.com/kokkos/kokkos/pull/7171) +* Introduce `KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE` as CMake option [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +#### OpenACC + +* Add support for building with the Clacc compiler [\#7198](https://github.com/kokkos/kokkos/pull/7198) +* Workaround NVHPC collapse clause bug for `MDRangePolicy` [\#7425](https://github.com/kokkos/kokkos/pull/7425) + +#### HPX + +* Implement `Experimental::partition_space` to produce truly independent execution spaces [\#7287](https://github.com/kokkos/kokkos/pull/7287) + +#### Threads + +* Fix compilation for `parallel_reduce` `MDRange` with `Dynamic` scheduling [\#7478](https://github.com/kokkos/kokkos/pull/7478) +* Fix race 
conditions on ARM architectures [\#7498](https://github.com/kokkos/kokkos/pull/7498) + +#### OpenMP + +* Fix run time behavior when compiling with `-fvisibility-hidden` [\#7284](https://github.com/kokkos/kokkos/pull/7284) (backported in 4.4.01) +* Fix linking with Cray Clang compiler [\#7341](https://github.com/kokkos/kokkos/pull/7341) + +#### Serial + +* Allow `Kokkos_ENABLE_ATOMICS_BYPASS` to skip mutexes to remediate performance regression in 4.4 [\#7369](https://github.com/kokkos/kokkos/pull/7369) + +### General Enhancements + +* Improve `View` initialization/destruction for non-scalar trivial and trivially-destructible types [\#7219](https://github.com/kokkos/kokkos/pull/7219) [\#7225](https://github.com/kokkos/kokkos/pull/7225) +* Add getters for default tile sizes used in `MDRangePolicy` [\#6839](https://github.com/kokkos/kokkos/pull/6839) +* Improve performance of `Kokkos::sort` when `std::sort` is used [\#7264](https://github.com/kokkos/kokkos/pull/7264) +* Add range-based for loop support for `Array<T, N>` [\#7293](https://github.com/kokkos/kokkos/pull/7293) +* Allow functors as reducers for nested team parallel reduce [\#6921](https://github.com/kokkos/kokkos/pull/6921) +* Avoid making copies of string rvalue reference arguments to `view_alloc()` [\#7364](https://github.com/kokkos/kokkos/pull/7364) +* Add `atomic_{mod,xor,nand,lshift,rshift}` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Allow using `SequentialHostInit` with `Kokkos::DualView` [\#7456](https://github.com/kokkos/kokkos/pull/7456) +* Add `Graph::instantiate()` [\#7240](https://github.com/kokkos/kokkos/pull/7240) +* Allow an arbitrary execution space instance to be used in `Kokkos::Graph::submit()` [\#7249](https://github.com/kokkos/kokkos/pull/7249) +* Enable compile-time diagnostic of illegal reduction target for graphs [\#7460](https://github.com/kokkos/kokkos/pull/7460) + +### Build System Changes + +* Make sure backend-specific options such as `IMPL_CUDA_MALLOC_ASYNC` only 
show when that backend is actually enabled [\#7228](https://github.com/kokkos/kokkos/pull/7228) +* Major refactoring removing `TriBITS` paths [\#6164](https://github.com/kokkos/kokkos/pull/6164) +* Add support for SpacemiT K60 (RISC-V) [\#7160](https://github.com/kokkos/kokkos/pull/7160) + +### Deprecations + +* Deprecate Tasking interface [\#7393](https://github.com/kokkos/kokkos/pull/7393) +* Deprecate `atomic_query_version`, `atomic_assign`, `atomic_compare_exchange_strong`, `atomic_{inc, dec}rement` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Deprecate `{OpenMP,HPX}::is_asynchronous()` [\#7322](https://github.com/kokkos/kokkos/pull/7322) + +### Bug Fixes + +* Fix undefined behavior in `BinSort` when sorting within bins on host [\#7223](https://github.com/kokkos/kokkos/pull/7223) +* Using CUDA limits to set extents for blocks, grids [\#7235](https://github.com/kokkos/kokkos/pull/7235) +* Fix `deep_copy (serial_exec, dst, src)` with multiple host backends [\#7245](https://github.com/kokkos/kokkos/pull/7245) +* Skip `RangePolicy` bounds conversion checks if roundtrip convertibility is not provided [\#7172](https://github.com/kokkos/kokkos/pull/7172) +* Allow extracting host and device views from `DualView` with `const` value type [\#7242](https://github.com/kokkos/kokkos/pull/7242) +* Fix `TeamPolicy` array reduction for CUDA and HIP [\#6296](https://github.com/kokkos/kokkos/pull/6296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix configuring without architecture flags for SYCL [\#7303](https://github.com/kokkos/kokkos/pull/7303) +* Set an initial value index during join of `MinLoc`, `MaxLoc` or `MinMaxLoc` [\#7330](https://github.com/kokkos/kokkos/pull/7330) +* Fix storage lifetime of driver for global launch of graph nodes for CUDA and HIP [\#7365](https://github.com/kokkos/kokkos/pull/7365) +* Make `value_type` for `RandomAccessIterator` non-`const` 
[\#7485](https://github.com/kokkos/kokkos/pull/7485) + +## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.00...4.4.01) + +### Features: +* Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental support for unified memory mode (intended for Grace-Hopper etc.) [\#6823](https://github.com/kokkos/kokkos/pull/6823) + +### Bug Fixes +* OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) + +## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00) + +### Features: +* Add `Kokkos::View` conversions from and to [`std::mdspan`](https://en.cppreference.com/w/cpp/container/mdspan) [\#6830](https://github.com/kokkos/kokkos/pull/6830) [\#7069](https://github.com/kokkos/kokkos/pull/7069) + +### Backend and Architecture Enhancements: + +#### CUDA: +* `nvcc_wrapper`: Adding ability to process `--disable-warnings` flag [\#6936](https://github.com/kokkos/kokkos/issues/6936) +* Use recommended/max team size functions in Cuda ParallelFor and Reduce constructors [\#6891](https://github.com/kokkos/kokkos/issues/6891) +* Improve compile-times when building with `Kokkos_ENABLE_DEBUG_BOUNDS_CHECK` in Cuda [\#7013](https://github.com/kokkos/kokkos/pull/7013) + +#### HIP: +* Use HIP builtin atomics [\#6882](https://github.com/kokkos/kokkos/pull/6882) [\#7000](https://github.com/kokkos/kokkos/pull/7000) +* Enable user-specified compiler and linker flags for AMD GPUs [\#7127](https://github.com/kokkos/kokkos/pull/7127) + +#### SYCL: +* Add support for Graphs 
[\#6912](https://github.com/kokkos/kokkos/pull/6912) +* Fix multi-GPU support [\#6887](https://github.com/kokkos/kokkos/pull/6887) +* Improve performance of reduction and scan operations [\#6562](https://github.com/kokkos/kokkos/pull/6562), [\#6750](https://github.com/kokkos/kokkos/pull/6750) +* Fix lock for guarding scratch space in `TeamPolicy` `parallel_reduce` [\#6988](https://github.com/kokkos/kokkos/pull/6988) +* Include submission command queue property information into `SYCL::print_configuration()` [\#7004](https://github.com/kokkos/kokkos/pull/7004) + +#### OpenACC: +* Make `TeamPolicy` `parallel_for` execute on the correct async queue [\#7012](https://github.com/kokkos/kokkos/pull/7012) + +#### OpenMPTarget: +* Honor user requested loop ordering in `MDRange` policy [\#6925](https://github.com/kokkos/kokkos/pull/6925) +* Prevent data races by guarding the scratch space used in `parallel_scan` [\#6998](https://github.com/kokkos/kokkos/pull/6998) + +#### HPX: +* Work around issue with template argument deduction to support compilation with NVCC [\#7015](https://github.com/kokkos/kokkos/pull/7015) + +### General Enhancements +* Improve performance of view copies in host parallel regions [\#6730](https://github.com/kokkos/kokkos/pull/6730) +* Harmonize convertibility rules of `Kokkos::RandomAccessIterator` with `View`s [\#6929](https://github.com/kokkos/kokkos/pull/6929) +* Add a precondition check for non-overlapping ranges in the `adjacent_difference` algorithm in debug mode [\#6922](https://github.com/kokkos/kokkos/pull/6922) +* Add deduction guides for `TeamPolicy` [\#7030](https://github.com/kokkos/kokkos/pull/7030) +* SIMD: Allow flexible vector width for 32 bit types [\#6802](https://github.com/kokkos/kokkos/pull/6802) +* Updates for `Kokkos::Array`: add `kokkos_swap(Array<T, N>)` specialization [\#6943](https://github.com/kokkos/kokkos/pull/6943), add `Kokkos::to_array` [\#6375](https://github.com/kokkos/kokkos/pull/6375), make `Kokkos::Array` 
equality-comparable [\#7148](https://github.com/kokkos/kokkos/pull/7148) +* Structured binding support for `Kokkos::complex` [\#7040](https://github.com/kokkos/kokkos/pull/7040) +* Introduce `KOKKOS_DEDUCTION_GUIDE` macro to allow for portable user-defined deduction guides [\#6954](https://github.com/kokkos/kokkos/pull/6954) + +### Build System Changes +* Do not require OpenMP support for languages other than CXX [\#6965](https://github.com/kokkos/kokkos/pull/6965) +* Update Intel GPU architectures in Makefile [\#6895](https://github.com/kokkos/kokkos/pull/6895) +* Fix use of OpenMP with Cuda or HIP as compile language [\#6972](https://github.com/kokkos/kokkos/pull/6972) +* Define and enforce new minimum compiler versions for C++20 support [\#7128](https://github.com/kokkos/kokkos/pull/7128), [\#7123](https://github.com/kokkos/kokkos/pull/7123) +* Add nvidia Grace CPU architecture: `Kokkos_ARCH_ARMV9_GRACE` [\#7158](https://github.com/kokkos/kokkos/pull/7158) +* Fix Makefile.kokkos for Threads [\#6896](https://github.com/kokkos/kokkos/pull/6896) +* Remove support for NVHPC as CUDA device compiler [\#6987](https://github.com/kokkos/kokkos/pull/6987) +* Fix using CUDAToolkit for CMake 3.28.4 and higher [\#7062](https://github.com/kokkos/kokkos/pull/7062) + +### Incompatibilities (i.e. 
breaking changes) +* Drop `Kokkos::Array` special treatment in `View`s [\#6906](https://github.com/kokkos/kokkos/pull/6906) +* Drop `Experimental::RawMemoryAllocationFailure` [\#7145](https://github.com/kokkos/kokkos/pull/7145) + +### Deprecations +* Remove `Experimental::LayoutTiled` class template and deprecate `is_layouttiled` trait [\#6907](https://github.com/kokkos/kokkos/pull/6907) +* Deprecate `Kokkos::layout_iterate_type_selector` [\#7076](https://github.com/kokkos/kokkos/pull/7076) +* Deprecate specialization of `Kokkos::pair` for a single element [\#6947](https://github.com/kokkos/kokkos/pull/6947) +* Deprecate `deep_copy` of `UnorderedMap` of different size [\#6812](https://github.com/kokkos/kokkos/pull/6812) +* Deprecate trailing `Proxy` template argument of `Kokkos::Array` [\#6934](https://github.com/kokkos/kokkos/pull/6934) +* Deprecate implicit conversions of integers to `ChunkSize` [\#7151](https://github.com/kokkos/kokkos/pull/7151) +* Deprecate implicit conversions to execution spaces [\#7156](https://github.com/kokkos/kokkos/pull/7156) + +### Bug Fixes +* Do not return a copy of the input functor in `Experimental::for_each` [\#6910](https://github.com/kokkos/kokkos/pull/6910) +* Fix `realloc` on views of non-default constructible element types [\#6993](https://github.com/kokkos/kokkos/pull/6993) +* Fix undefined behavior in `View` initialization or fill with zeros [\#7014](https://github.com/kokkos/kokkos/pull/7014) +* Fix `sort_by_key` on host execution spaces when building with NVCC [\#7059](https://github.com/kokkos/kokkos/pull/7059) +* Fix using shared libraries and -fvisibility=hidden [\#7065](https://github.com/kokkos/kokkos/pull/7065) +* Fix view reference counting when functor copy constructor throws in parallel dispatch [\#6289](https://github.com/kokkos/kokkos/pull/6289) +* Fix `initialize(InitializationSetting)` for handling `print_configuration` setting [\#7098](https://github.com/kokkos/kokkos/pull/7098) +* Thread safety fixes for 
the Serial and OpenMP backend [\#7080](https://github.com/kokkos/kokkos/pull/7080), [\#6151](https://github.com/kokkos/kokkos/pull/6151) + +## [4.3.01](https://github.com/kokkos/kokkos/tree/4.3.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.00...4.3.01) + +### Backend and Architecture Enhancements: + +#### HIP: +* MI300 support unified memory [\#6877](https://github.com/kokkos/kokkos/pull/6877) + +### Bug Fixes +* Serial: Use the provided execution space instance in TeamPolicy [\#6951](https://github.com/kokkos/kokkos/pull/6951) +* `nvcc_wrapper`: bring back support for `--fmad` option [\#6931](https://github.com/kokkos/kokkos/pull/6931) +* Fix CUDA reduction overflow for `RangePolicy` [\#6578](https://github.com/kokkos/kokkos/pull/6578) + +## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) + +### Features: +* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782) +* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701) +* Don't use the compiler launcher script if the CMake compile language is CUDA. 
[\#6704](https://github.com/kokkos/kokkos/pull/6704) +* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615) + +#### HIP: + * Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857) + * Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793) + +#### SYCL: +* We only support OneAPI SYCL implementation: add check during initialization + * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784) + * Filter GPU devices for `ext_oneapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6758) +* Performance Improvements + * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739) + * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500) +* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6577) +* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870) + +#### OpenMPTarget: +* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380) +* Guard scratch memory usage in ParallelReduce [\#6585](https://github.com/kokkos/kokkos/pull/6585) +* Update linker flags for Intel GPUs [\#6735](https://github.com/kokkos/kokkos/pull/6735) +* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652) + +#### OpenACC: +* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446) +* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) + +#### Threads: +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) + +#### OpenMP: +* Improve performance of view 
initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) + +### General Enhancements + +* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556) +* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598) +* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373) +* Provide new public headers `<Kokkos_Clamp.hpp>` and `<Kokkos_MinMax.hpp>` [\#6687](https://github.com/kokkos/kokkos/pull/6687) +* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations and drop unnecessary initialization + avoid evaluating the predicate twice during final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747) +* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713) +* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243) +* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524) +* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813) +* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855) +* CTAD (deduction guides) for RangePolicy [\#6850](https://github.com/kokkos/kokkos/pull/6850) +* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516) + +### Build System Changes +* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operation for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692) +* Add support for RISCV and the Milk-V's Pioneer 
[\#6773](https://github.com/kokkos/kokkos/pull/6773) +* Add C++26 standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733) +* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606) +* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898) + +### Incompatibilities (i.e. breaking changes) +* Remove the `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523) +* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665) +* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690) +* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726) +* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754) +* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579) +* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593) +* Check matching static extents in View constructor [\#5190](https://github.com/kokkos/kokkos/pull/5190) +* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642) +* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845) +* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861) +* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797) +* Remove `Kokkos::Experimental::LogicalMemorySpace` (without 
going through deprecation) [\#6557](https://github.com/kokkos/kokkos/pull/6557) +* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791) +* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798) +* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806) +* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744) + +### Deprecations +* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697) +* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710) +* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582) + +### Bug Fixes +* Fix team-level MDRange reductions [\#6511](https://github.com/kokkos/kokkos/pull/6511) +* Fix CUDA and SYCL small value type (16-bit) team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334) +* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667) +* `fill_random` overloads that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658) +* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777) +* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786) +* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821) +* Fix a bug in Makefile.kokkos 
when using AMD GPU architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892) + +## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) + +### Backend and Architecture Enhancements: + +#### CUDA: +- Add warp sync for `parallel_reduce` to avoid race condition [\#6630](https://github.com/kokkos/kokkos/pull/6630), [\#6746](https://github.com/kokkos/kokkos/pull/6746) + +#### HIP: +- Fix Graph "multiple definition of" linking error (missing `inline` specifier) [\#6624](https://github.com/kokkos/kokkos/pull/6624) +- Add support for gfx940 (AMD Instinct MI300 GPU) [\#6671](https://github.com/kokkos/kokkos/pull/6671) + +### Build System +- CMake: Don't let Kokkos set `CMAKE_CXX_FLAGS` for Trilinos builds [\#6742](https://github.com/kokkos/kokkos/pull/6742) + +### Bug Fixes +- Remove deprecation warning for `AllocationMechanism` for GCC <11.0 [\#6653](https://github.com/kokkos/kokkos/pull/6653) +- Fix bug early tools finalize with non-default host execution instances [\#6635](https://github.com/kokkos/kokkos/pull/6635) +- Fix various issues for MSVC CUDA builds [\#6659](https://github.com/kokkos/kokkos/pull/6659) +- Fix "extra `;`" warning with `-pedantic` flag in `<Kokkos_SIMD_Scalar.hpp>` [\#6510](https://github.com/kokkos/kokkos/pull/6510) + ## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00) @@ -43,7 +374,7 @@ #### SYCL: - Enforce external `sycl::queues` to be in-order [\#6246](https://github.com/kokkos/kokkos/pull/6246) -- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264) +- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) 
[\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264) - Allow using the SYCL execution space on AMD GPUs [\#6321](https://github.com/kokkos/kokkos/pull/6321) - Allow sorting via native oneDPL to support Views with stride=1 [\#6322](https://github.com/kokkos/kokkos/pull/6322) - Make in-order queues the default via macro [\#6189](https://github.com/kokkos/kokkos/pull/6189) @@ -64,7 +395,7 @@ - Add converting assignment to `DualView`: [\#6474](https://github.com/kokkos/kokkos/pull/6474) -### Build System Changes +### Build System Changes - Export `Kokkos_CXX_COMPILER_VERSION` [\#6282](https://github.com/kokkos/kokkos/pull/6282) - Disable default oneDPL support in Trilinos [\#6342](https://github.com/kokkos/kokkos/pull/6342) @@ -978,95 +1309,95 @@ - Major update for OpenMPTarget: many capabilities now work. For details contact us. - Added DPC++/SYCL backend: primary capabilites are working. - Added Kokkos Graph API analogous to CUDA Graphs. 
-- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/#3536) -- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/#3546) -- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/#3439) -- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/#3379) +- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536) +- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546) +- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439) +- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379) **Implemented enhancements Backends and Archs:** -- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/#3614) -- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/#3375) -- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/#3583) -- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/#3577) -- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/#3544) -- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/#3550) -- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/#3480) -- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/#3474) -- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/#3451) -- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/#3447) -- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/#3504) -- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/#3411) -- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/#3440) -- HIP enable global 
lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/#3418) -- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/#3366) +- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614) +- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/3375) +- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583) +- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577) +- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544) +- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550) +- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480) +- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474) +- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451) +- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447) +- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504) +- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411) +- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440) +- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418) +- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366) **Implemented enhancements Policies:** -- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/#3494) -- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527) -- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395) -- Kokkos Graph: Interface and Default Implementation 
[\#3362](https://github.com/kokkos/kokkos/pull/#3362) -- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369) -- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206) -- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509) +- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494) +- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527) +- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395) +- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362) +- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369) +- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206) +- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509) **Implemented enhancements BuildSystem:** -- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488) -- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548) -- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136) -- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434) -- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402) -- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457) +- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488) +- Avoid rewriting test files when calling cmake 
[\#3548](https://github.com/kokkos/kokkos/pull/3548) +- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136) +- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434) +- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402) +- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457) **Implemented enhancements Tools:** -- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455) -- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530) -- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518) -- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459) -- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326) +- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455) +- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530) +- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518) +- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459) +- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326) **Implemented enhancements Other:** -- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/#3528) -- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449) -- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436) -- Rename struct ViewMapping to class 
[\#3435](https://github.com/kokkos/kokkos/pull/#3435) -- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422) -- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416) -- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388) -- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359) -- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357) -- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340) -- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339) -- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338) -- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309) -- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265) -- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941) +- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528) +- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/3449) +- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436) +- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435) +- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422) +- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416) +- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388) +- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359) +- Moved Space EBO helpers to 
Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357) +- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340) +- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339) +- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338) +- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309) +- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265) +- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941) **Fixed bugs:** -- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591) -- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588) -- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566) -- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565) -- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/#3532) -- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529) -- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510) -- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/#3503) -- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467) -- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458) -- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398) -- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393) -- Fix 
compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390) -- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378) -- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348) -- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345) -- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343) -- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260) +- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591) +- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588) +- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566) +- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565) +- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532) +- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529) +- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510) +- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503) +- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/3467) +- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458) +- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398) +- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393) +- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390) +- Fix sigFPE in HIP blocksize 
deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378) +- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348) +- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345) +- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343) +- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260) **Incompatibilities:** -- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535) -- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534) -- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301) -- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264) -- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148) +- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535) +- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534) +- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301) +- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264) +- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148) ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01) @@ -1163,7 +1494,7 @@ **Closed issues:** - Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097) -- Remove KOKKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) +- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) - Cuda 11 -\> allow C++17 
[\#3083](https://github.com/kokkos/kokkos/issues/3083) - In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081) - Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070) diff --git a/packages/kokkos/CITATION.cff b/packages/kokkos/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..28c674c451bf74c9bc3e7ad0bf6dc1d4f258545d --- /dev/null +++ b/packages/kokkos/CITATION.cff @@ -0,0 +1,65 @@ +cff-version: 1.2.0 +title: Kokkos +message: >- + If you use this software, please cite the overview paper +type: software +authors: + - name: The Kokkos authors + website: https://kokkos.org/community/team/ +identifiers: + - type: url + website: https://kokkos.org/kokkos-core-wiki/citation.html +repository-code: 'https://github.com/kokkos/kokkos' +url: 'https://kokkos.org/' +license: Apache-2.0 +preferred-citation: + type: article + authors: + - given-names: Christian R. + family-names: Trott + - given-names: Damien + family-names: Lebrun-Grandié + - given-names: Daniel + family-names: Arndt + - family-names: Ciesko + given-names: Jan + - given-names: Vinh + family-names: Dang + - family-names: Ellingwood + given-names: Nathan + - given-names: Rahulkumar + family-names: Gayatri + - given-names: Evan + family-names: Harvey + - given-names: Daisy S. 
+ family-names: Hollman + - given-names: Dan + family-names: Ibanez + - given-names: Nevin + family-names: Liber + - given-names: Jonathan + family-names: Madsen + - given-names: Jeff + family-names: Miles + - given-names: David + family-names: Poliakoff + - given-names: Amy + family-names: Powell + - given-names: Sivasankaran + family-names: Rajamanickam + - given-names: Mikael + family-names: Simberg + - given-names: Dan + family-names: Sunderland + - given-names: Bruno + family-names: Turcksin + - given-names: Jeremiah + family-names: Wilke + doi: 10.1109/TPDS.2021.3097283 + journal: IEEE Transactions on Parallel and Distributed Systems + start: 805 + end: 817 + title: "Kokkos 3: Programming Model Extensions for the Exascale Era" + volume: 33 + issue: 4 + year: 2022 diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt index f6bd81058e9016b9c2f67a50ece0bef9c85e83f3..6a70bea1497367c259593ce9abe9a39a25acfba9 100644 --- a/packages/kokkos/CMakeLists.txt +++ b/packages/kokkos/CMakeLists.txt @@ -1,12 +1,11 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) # Disable in-source builds to prevent source tree corruption. -if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) - message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." ) -endif() - -if (COMMAND TRIBITS_PACKAGE) - TRIBITS_PACKAGE(Kokkos) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + message( + FATAL_ERROR + "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." + ) endif() # We want to determine if options are given with the wrong case @@ -15,143 +14,142 @@ endif() # form a list of all the given variables. If it begins with any # case of KoKkOS, we add it to the list. 
-GET_CMAKE_PROPERTY(_variableNames VARIABLES) -SET(KOKKOS_GIVEN_VARIABLES) -FOREACH (var ${_variableNames}) - STRING(TOUPPER ${var} UC_VAR) - STRING(FIND ${UC_VAR} KOKKOS IDX) - IF (${IDX} EQUAL 0) - LIST(APPEND KOKKOS_GIVEN_VARIABLES ${var}) - ENDIF() -ENDFOREACH() +get_cmake_property(_variableNames VARIABLES) +set(KOKKOS_GIVEN_VARIABLES) +foreach(var ${_variableNames}) + string(TOUPPER ${var} UC_VAR) + string(FIND ${UC_VAR} KOKKOS IDX) + if(${IDX} EQUAL 0) + list(APPEND KOKKOS_GIVEN_VARIABLES ${var}) + endif() +endforeach() # Basic initialization (Used in KOKKOS_SETTINGS) -SET(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) - -# Is this a build as part of Trilinos? -IF(COMMAND TRIBITS_PACKAGE_DECL) - SET(KOKKOS_HAS_TRILINOS ON) -ELSE() - SET(KOKKOS_HAS_TRILINOS OFF) - SET(PACKAGE_NAME Kokkos) - SET(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -ENDIF() -# Is this build a subdirectory of another project -GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY) +set(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(PACKAGE_NAME Kokkos) +set(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) +# Is this build a subdirectory of another project +get_directory_property(HAS_PARENT PARENT_DIRECTORY) -SET(KOKKOS_ENABLED_OPTIONS) #exported in config file -SET(KOKKOS_ENABLED_DEVICES) #exported in config file -SET(KOKKOS_ENABLED_TPLS) #exported in config file -SET(KOKKOS_ENABLED_ARCH_LIST) #exported in config file +include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) 
+include(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) + +set(KOKKOS_ENABLED_OPTIONS) #exported in config file +set(KOKKOS_ENABLED_DEVICES) #exported in config file +set(KOKKOS_ENABLED_TPLS) #exported in config file +set(KOKKOS_ENABLED_ARCH_LIST) #exported in config file #These are helper flags used for sanity checks during config #Certain features should depend on other features being configured first -SET(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies -SET(KOKKOS_CFG_DAG_DEVICES_DONE Off) -SET(KOKKOS_CFG_DAG_OPTIONS_DONE Off) -SET(KOKKOS_CFG_DAG_ARCH_DONE Off) -SET(KOKKOS_CFG_DAG_CXX_STD_DONE Off) -SET(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) -FUNCTION(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) - SET(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) - SET(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) - IF (NOT ${PRE_FLAG}) - MESSAGE(FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured") - ENDIF() - GLOBAL_SET(${POST_FLAG} On) -ENDFUNCTION() - - -LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) - -IF(NOT KOKKOS_HAS_TRILINOS) - set(CMAKE_DISABLE_SOURCE_CHANGES ON) - set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) - - # What language are we compiling Kokkos as - # downstream dependencies need to match this! - SET(KOKKOS_COMPILE_LANGUAGE CXX) - # use lower case here since we didn't parse options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) - - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as CUDA only - # because otherwise the C++ features don't work etc. - # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even - # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 - # days. 
- SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - - SET(KOKKOS_COMPILE_LANGUAGE CUDA) - ENDIF() - # use lower case here since we haven't parsed options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) - - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as HIP only - # because otherwise the C++ features don't work etc. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - - SET(KOKKOS_COMPILE_LANGUAGE HIP) - ENDIF() - - IF (Spack_WORKAROUND) - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - MESSAGE(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") - ENDIF() - - #if we are explicitly using Spack for development, - #nuke the Spack compiler - SET(SPACK_CXX $ENV{SPACK_CXX}) - IF(SPACK_CXX) - SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) - SET(ENV{CXX} ${SPACK_CXX}) - ENDIF() - ENDIF() - # Always call the project command to define Kokkos_ variables - # and to make sure that C++ is an enabled language - PROJECT(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) - IF(NOT HAS_PARENT) - IF (NOT CMAKE_BUILD_TYPE) - SET(DEFAULT_BUILD_TYPE "RelWithDebInfo") - MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") - SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING - "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." - FORCE) - ENDIF() - ENDIF() -ELSE() - SET(KOKKOS_COMPILE_LANGUAGE CXX) -ENDIF() - -IF (NOT CMAKE_SIZEOF_VOID_P) - STRING(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) - IF (NOT FIND_IDX STREQUAL -1) - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. 
Please ensure your CUDA environment is correctly configured.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation") - ENDIF() -ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - IF(CMAKE_SIZEOF_VOID_P EQUAL 4) - MESSAGE(WARNING "32-bit builds are experimental and not officially supported.") - SET(KOKKOS_IMPL_32BIT ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;") - ENDIF() -ENDIF() +set(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies +set(KOKKOS_CFG_DAG_DEVICES_DONE Off) +set(KOKKOS_CFG_DAG_OPTIONS_DONE Off) +set(KOKKOS_CFG_DAG_ARCH_DONE Off) +set(KOKKOS_CFG_DAG_CXX_STD_DONE Off) +set(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) +function(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) + set(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) + set(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) + if(NOT ${PRE_FLAG}) + message( + FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured" + ) + endif() + global_set(${POST_FLAG} On) +endfunction() + +list(APPEND CMAKE_MODULE_PATH cmake/Modules) + +set(CMAKE_DISABLE_SOURCE_CHANGES ON) +set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) + +# What language are we compiling Kokkos as +# downstream dependencies need to match this! +set(KOKKOS_COMPILE_LANGUAGE CXX) +# use lower case here since we didn't parse options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) + + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as CUDA only + # because otherwise the C++ features don't work etc. 
+ # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even + # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 + # days. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + + set(KOKKOS_COMPILE_LANGUAGE CUDA) +endif() +# use lower case here since we haven't parsed options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) + + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as HIP only + # because otherwise the C++ features don't work etc. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + set(KOKKOS_COMPILE_LANGUAGE HIP) +endif() + +if(Spack_WORKAROUND) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + message(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") + endif() + + #if we are explicitly using Spack for development, + #nuke the Spack compiler + set(SPACK_CXX $ENV{SPACK_CXX}) + if(SPACK_CXX) + set(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) + set(ENV{CXX} ${SPACK_CXX}) + endif() +endif() +# Always call the project command to define Kokkos_ variables +# and to make sure that C++ is an enabled language +project(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) +if(NOT HAS_PARENT) + if(NOT CMAKE_BUILD_TYPE) + set(DEFAULT_BUILD_TYPE "RelWithDebInfo") + message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") + set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" + CACHE STRING "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." FORCE + ) + endif() +endif() + +if(NOT CMAKE_SIZEOF_VOID_P) + string(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) + if(NOT FIND_IDX STREQUAL -1) + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. 
The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured." + ) + else() + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation" + ) + endif() +elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + message(WARNING "32-bit builds are experimental and not officially supported.") + set(KOKKOS_IMPL_32BIT ON) + else() + message( + FATAL_ERROR + "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;" + ) + endif() +endif() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 2) -set(Kokkos_VERSION_PATCH 0) +set(Kokkos_VERSION_MINOR 5) +set(Kokkos_VERSION_PATCH 1) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -164,58 +162,54 @@ math(EXPR KOKKOS_VERSION_PATCH "${KOKKOS_VERSION} % 100") # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) +include(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # If we are building CUDA, we have tricked CMake because we declare a CXX project # If the default C++ standard for a given compiler matches the requested # standard, then CMake just omits the -std flag in later versions of CMake # This breaks CUDA compilation (CUDA compiler can have a different default # -std then the underlying host compiler by itself). 
Setting this variable # forces CMake to always add the -std flag even if it thinks it doesn't need it - GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) -ENDIF() + global_set(CMAKE_CXX_STANDARD_DEFAULT 98) +endif() # These are the variables we will append to as we go # I really wish these were regular variables # but scoping issues can make it difficult -GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS) -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) -GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) -GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) +global_set(KOKKOS_COMPILE_OPTIONS) +global_set(KOKKOS_LINK_OPTIONS) +global_set(KOKKOS_AMDGPU_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDAFE_OPTIONS) +global_set(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos -GLOBAL_SET(KOKKOS_TPL_EXPORTS) +global_set(KOKKOS_TPL_EXPORTS) # KOKKOS_DEPENDENCE is used by kokkos_launch_compiler -GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +global_set(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) # MSVC never goes through kokkos_launch_compiler -IF(NOT MSVC) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) -ENDIF() +if(NOT MSVC) + global_append(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +endif() + +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/kokkos_configure_trilinos.cmake) -IF(Kokkos_ENABLE_TESTS AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_TESTS) find_package(GTest QUIET) -ENDIF() +endif() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS # These are functions like KOKKOS_INCLUDE_DIRECTORIES -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) - +include(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) # Check the environment and set certain variables # to allow platform-specific checks -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) -IF(NOT 
KOKKOS_HAS_TRILINOS) - # This does not work in Trilinos and we simply don't care - # to fix it for Trilinos - # Gather information about the runtime environment - INCLUDE(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) - check_git_setup() -ENDIF() +include(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) +check_git_setup() # The build environment setup goes in the following steps # 1) Check all the enable options. This includes checking Kokkos_DEVICES @@ -223,111 +217,54 @@ ENDIF() # 3) Check the CXX standard and select important CXX flags # 4) Check for any third-party libraries (TPLs) like hwloc # 5) Check if optimizing for a particular architecture and add arch-specific flags -KOKKOS_SETUP_BUILD_ENVIRONMENT() +kokkos_setup_build_environment() # Finish off the build # 6) Recurse into subdirectories and configure individual libraries # 7) Export and install targets -OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) -SET(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) -SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) +set(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) +set_property(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) -IF (KOKKOS_HAS_TRILINOS) - SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) - SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR}) - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSEIF(HAS_PARENT) - SET(KOKKOS_HEADER_DIR "include/kokkos") - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSE() - SET(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") - SET(KOKKOS_IS_SUBDIRECTORY FALSE) -ENDIF() +if(HAS_PARENT) + set(KOKKOS_HEADER_DIR "include/kokkos") + set(KOKKOS_IS_SUBDIRECTORY TRUE) +else() + set(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") + set(KOKKOS_IS_SUBDIRECTORY FALSE) +endif() #------------------------------------------------------------------------------ # 
# A) Forward declare the package so that certain options are also defined for # subpackages -## This restores the old behavior of ProjectCompilerPostConfig.cmake -# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos -# We must do this before KOKKOS_PACKAGE_DECL -IF (KOKKOS_HAS_TRILINOS) - # Overwrite the old flags at the top-level - # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags - # we have to match the annoying behavior, also we have to preserve quotes - # which needs another workaround. - SET(KOKKOS_COMPILE_OPTIONS_TMP) - IF (KOKKOS_ENABLE_HIP) - LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS}) - ENDIF() - FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) - STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) - IF(OPTION_HAS_WHITESPACE EQUAL -1) - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}") - ELSE() - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"") - ENDIF() - ENDFOREACH() - STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) - IF (KOKKOS_ENABLE_CUDA) - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) - ENDIF() - FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS}) - SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) - ENDFOREACH() - SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}") - IF (KOKKOS_ENABLE_CUDA) - STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") - FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) - SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) - ENDFOREACH() - SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}") - ENDIF() - # Both parent scope and this package - 
# In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in - # TRILINOS_TOPLEVEL_CXX_FLAGS - SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE) - SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}") - #CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here - #These flags get set up in KOKKOS_PACKAGE_DECL, which means they - #must be configured before KOKKOS_PACKAGE_DECL - SET(KOKKOS_ALL_COMPILE_OPTIONS - $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_ALL_COMPILE_OPTIONS}>) -ENDIF() - - #------------------------------------------------------------------------------ # # D) Process the subpackages (subdirectories) for Kokkos # -KOKKOS_PROCESS_SUBPACKAGES() - +kokkos_process_subpackages() #------------------------------------------------------------------------------ # # E) If Kokkos itself is enabled, process the Kokkos package # -KOKKOS_PACKAGE_POSTPROCESS() -KOKKOS_CONFIGURE_CORE() +kokkos_configure_core() -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - ADD_LIBRARY(kokkos INTERFACE) +if(NOT Kokkos_INSTALL_TESTING) + add_library(kokkos INTERFACE) #Make sure in-tree projects can reference this as Kokkos:: #to match the installed target names - ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos) + add_library(Kokkos::kokkos ALIAS kokkos) # all_libs target is required for TriBITS-compliance - ADD_LIBRARY(Kokkos::all_libs ALIAS kokkos) - TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos) -ENDIF() -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) + add_library(Kokkos::all_libs ALIAS kokkos) + target_link_libraries(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) + kokkos_internal_add_library_install(kokkos) +endif() +include(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler. # Kokkos needs nvcc_wrapper in order to build. 
Other libraries and @@ -336,16 +273,15 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # as relative to ${CMAKE_INSTALL_PATH}. # KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated # files -KOKKOS_INSTALL_ADDITIONAL_FILES() - +kokkos_install_additional_files() # Finally - if we are a subproject - make sure the enabled devices are visible -IF (HAS_PARENT) - FOREACH(DEV Kokkos_ENABLED_DEVICES) +if(HAS_PARENT) + foreach(DEV Kokkos_ENABLED_DEVICES) #I would much rather not make these cache variables or global properties, but I can't #make any guarantees on whether PARENT_SCOPE is good enough to make #these variables visible where I need them - SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) - SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) - ENDFOREACH() -ENDIF() + set(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) + set_property(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) + endforeach() +endif() diff --git a/packages/kokkos/CONTRIBUTING.md b/packages/kokkos/CONTRIBUTING.md index b4f3057cef2cb707b55b3ae3331000ff4e55c0e1..e97f8c4d89c5789bdf91567a689cbbc5aaabc4a7 100644 --- a/packages/kokkos/CONTRIBUTING.md +++ b/packages/kokkos/CONTRIBUTING.md @@ -7,6 +7,8 @@ We actively welcome pull requests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. +Before sending your patch for review, please try to ensure that it is formatted properly. We use clang-format version 16 for this. + ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. diff --git a/packages/kokkos/Copyright.txt b/packages/kokkos/Copyright.txt index 5e2f8d8647b53b8def2e240c92fdbad04b1550ec..cbba3efc7bc513887fbad0865e5f0c886b547ea2 100644 --- a/packages/kokkos/Copyright.txt +++ b/packages/kokkos/Copyright.txt @@ -1,41 +1,8 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER +************************************************************************ + + Kokkos v. 
4.0 + Copyright (2022) National Technology & Engineering + Solutions of Sandia, LLC (NTESS). + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. diff --git a/packages/kokkos/HOW_TO_SNAPSHOT b/packages/kokkos/HOW_TO_SNAPSHOT deleted file mode 100644 index ad3f78efb4f8dd8399e3fb2889def7e841b531f9..0000000000000000000000000000000000000000 --- a/packages/kokkos/HOW_TO_SNAPSHOT +++ /dev/null @@ -1,73 +0,0 @@ - -Developers of Kokkos (those who commit modifications to Kokkos) -must maintain the snapshot of Kokkos in the Trilinos repository. - -This file contains instructions for how to -snapshot Kokkos from github.com/kokkos to Trilinos. - ------------------------------------------------------------------------- -*** EVERYTHING GOES RIGHT WORKFLOW *** - -1) Given a 'git clone' of Kokkos and of Trilinos repositories. -1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone. - This path *must* terminate with the directory name 'kokkos'; - e.g., ${HOME}/kokkos . -1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory. - -2) Given that the Kokkos build & test is clean and - changes are committed to the Kokkos clone. - -3) Snapshot the current commit in the Kokkos clone into the Trilinos clone. - This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}: - ${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages - -4) Verify the snapshot commit happened as expected - cd ${TRILINOS}/packages/kokkos - git log -1 --name-only - -5) Modify, build, and test Trilinos with the Kokkos snapshot. - -6) Given that that the Trilinos build & test is clean and - changes are committed to the Trilinos clone. - -7) Attempt push to the Kokkos repository. - If push fails then you must 'remove the Kokkos snapshot' - from your Trilinos clone. - See below. - -8) Attempt to push to the Trilinos repository. 
- If updating for a failed push requires you to change Kokkos you must - 'remove the Kokkos snapshot' from your Trilinos clone. - See below. - ------------------------------------------------------------------------- -*** WHEN SOMETHING GOES WRONG AND YOU MUST *** -*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE *** - -1) Query the Trilinos clone commit log. - git log --oneline - -2) Note the <SHA1> of the commit to the Trillinos clone - immediately BEFORE the Kokkos snapshot commit. - Copy this <SHA1> for use in the next command. - -3) IF more than one outstanding commit then you can remove just the - Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file. - Remove or comment out the Kokkos snapshot commit entry. - git rebase -i <SHA1> - -4) IF the Kokkos snapshot commit is the one and only - outstanding commit then remove just than commit. - git reset --hard HEAD~1 - ------------------------------------------------------------------------- -*** REGARDING 'snapshot.py' TOOL *** - -The 'snapshot.py' tool is developed and maintained by the -Center for Computing Research (CCR) -Software Engineering, Maintenance, and Support (SEMS) team. - -Contact Brent Perschbacher <bmpersc@sandia.gov> for questions> - ------------------------------------------------------------------------- - diff --git a/packages/kokkos/LICENSE b/packages/kokkos/LICENSE index 6572cc2db055e848e53f82c024d75b3404ef0bf8..4d9d69d7c44591139614a52b79eb16de49acaf0b 100644 --- a/packages/kokkos/LICENSE +++ b/packages/kokkos/LICENSE @@ -1,13 +1,3 @@ - ************************************************************************ - - Kokkos v. 4.0 - Copyright (2022) National Technology & Engineering - Solutions of Sandia, LLC (NTESS). - - Under the terms of Contract DE-NA0003525 with NTESS, - the U.S. Government retains certain rights in this software. 
- - ============================================================================== Kokkos is under the Apache License v2.0 with LLVM Exceptions: ============================================================================== diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos index 7137ec3936cc3d4d961e9c349a15a2aae7906167..f67eadf241f38ce9c542e477a24c232c957043b8 100644 --- a/packages/kokkos/Makefile.kokkos +++ b/packages/kokkos/Makefile.kokkos @@ -1,8 +1,8 @@ # Default settings common options. KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 2 -KOKKOS_VERSION_PATCH = 0 +KOKKOS_VERSION_MINOR = 5 +KOKKOS_VERSION_PATCH = 1 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -11,15 +11,15 @@ KOKKOS_DEVICES ?= "Threads" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 -# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX -# IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX942,GFX1030,GFX1100 +# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX,ARMv9-Grace +# IBM: Power8,Power9 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX942_APU,AMD_GFX1030,AMD_GFX1100,AMD_GFX1103 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 -# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC +# Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" -# Options: hwloc,librt,experimental_memkind +# Options: hwloc KOKKOS_USE_TPLS ?= "" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" @@ -30,23 +30,26 @@ KOKKOS_TRIBITS ?= "no" KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. 
-# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async +# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,enable_malloc_async KOKKOS_CUDA_OPTIONS ?= "" -# Options: rdc +# Options: rdc,enable_malloc_async KOKKOS_HIP_OPTIONS ?= "" # Default settings specific options. # Options: enable_async_dispatch KOKKOS_HPX_OPTIONS ?= "" +#Options : force_host_as_device +KOKKOS_OPENACC_OPTIONS ?= "" + # Helper functions for conversion to upper case uppercase_TABLE:=a,A b,B c,C d,D e,E f,F g,G h,H i,I j,J k,K l,L m,M n,N o,O p,P q,Q r,R s,S t,T u,U v,V w,W x,X y,Y z,Z uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(wordlist 2,$(words $1),$1),$2)),$2) uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT) # Return a 1 if a string contains a substring and 0 if not # Note the search string should be without '"' -# Example: $(call kokkos_has_string,"hwloc,librt",hwloc) +# Example: $(call kokkos_has_string,"hwloc,libdl",hwloc) # Will return a 1 kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0) # Returns 1 if the path exists, 0 otherwise @@ -63,11 +66,11 @@ KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a) KOKKOS_INTERNAL_ENABLE_CXX23 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++23) KOKKOS_INTERNAL_ENABLE_CXX2B := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2b) +KOKKOS_INTERNAL_ENABLE_CXX26 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++26) +KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2c) # Check for external libraries. 
KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) -KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) -KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) # Check for advanced settings. KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) @@ -82,7 +85,7 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS), KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) -KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),disable_malloc_async) +KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_malloc_async) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) # deprecated KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) @@ -93,6 +96,8 @@ KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPT KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings) KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) +KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),enable_malloc_async) +KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE := $(call kokkos_has_string,$(KOKKOS_OPENACC_OPTIONS),force_host_as_device) # Check for Kokkos Host Execution Spaces one of which must be on. 
KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP) @@ -168,7 +173,7 @@ KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_NVHPC := $(strip $(shell $(CXX) --version 2>&1 | grep -c "nvc++")) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++")) +KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -v "error:" | grep -c "clang++")) KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI) KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) @@ -282,6 +287,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) # Set OpenACC flags. ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) KOKKOS_INTERNAL_OPENACC_FLAG := -acc + else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_OPENACC_FLAG := -fopenacc -fopenacc-fake-async-wait -fopenacc-implicit-worker=vector -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version -Wno-pass-failed else $(error Makefile.kokkos: OpenACC is enabled but the compiler must be NVHPC (got version string $(KOKKOS_CXX_VERSION))) endif @@ -308,7 +315,6 @@ endif # Intel based. 
KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) -KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) @@ -319,12 +325,43 @@ KOKKOS_INTERNAL_USE_ARCH_ICL := $(call kokkos_has_string,$(KOKKOS_ARCH),ICL) KOKKOS_INTERNAL_USE_ARCH_ICX := $(call kokkos_has_string,$(KOKKOS_ARCH),ICX) KOKKOS_INTERNAL_USE_ARCH_SPR := $(call kokkos_has_string,$(KOKKOS_ARCH),SPR) -KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) +# Traditionally, we supported, e.g., IntelGen9 instead of Intel_Gen9. The latter +# matches the CMake option but we also accept the former for backward-compatibility. KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen11) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen12LP) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen9) +endif +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9) \ + + 
$(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11) \ + + $(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP)) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN_SET), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_Gen) + endif +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_DG1) +endif KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 0) + KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),Intel_XeHP) +endif +# Traditionally the architecture was called PVC instead of Intel_PVC. This +# version makes us accept IntelPVC and Intel_PVC as well. KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC := $(call kokkos_has_string,$(KOKKOS_ARCH),PVC) # NVIDIA based. 
@@ -371,8 +408,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH) @@ -385,14 +422,13 @@ KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8 KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX) KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2) KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) -KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) +KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv9-Grace) +KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE) | bc)) # IBM based. 
-KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) -KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) -KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) # AMD based. KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) @@ -403,20 +439,48 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) endif endif -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) + +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908) +ifeq 
($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942_APU) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1103) +KOKKOS_INTERNAL_USE_ARCH_AMD := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103)) # Any AVX? 
-KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -504,6 +568,9 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE") + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -561,6 +628,16 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2B), 1) KOKKOS_CXXFLAGS += 
$(KOKKOS_INTERNAL_CXX2B_FLAG) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX23") endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX26), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX26_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2C), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2C_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) @@ -600,27 +677,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC") endif -ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT") - KOKKOS_LIBS += -lrt - KOKKOS_TPL_LIBRARY_NAMES += rt -endif - -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - ifneq ($(KOKKOS_CMAKE), yes) - ifneq ($(MEMKIND_PATH),) - KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include - KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib - KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include - KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib - endif - KOKKOS_LIBS += -lmemkind -lnuma - KOKKOS_TPL_LIBRARY_NAMES += memkind numa - endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE") -endif - ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") endif @@ -687,11 +743,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND") - endif - - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) + ifeq ($(KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC), 1) tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC */") @@ -748,6 +800,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV9_GRACE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV9_GRACE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARM_NEON") + + KOKKOS_CXXFLAGS += -mcpu=neoverse-v2 -msve-vector-bits=128 + KOKKOS_LDFLAGS += -mcpu=neoverse-v2 -msve-vector-bits=128 +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2") @@ -815,20 +875,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) endif endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42") - - ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) - KOKKOS_CXXFLAGS += -xSSE4.2 - KOKKOS_LDFLAGS += -xSSE4.2 - else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - else - # Assume that this is a really a GNU compiler. - KOKKOS_CXXFLAGS += -msse4.2 - KOKKOS_LDFLAGS += -msse4.2 - endif -endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX") @@ -988,86 +1034,122 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--offload-arch + endif +endif + # Do not add this flag if its the cray compiler or the nvhpc compiler. 
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 0) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) - # Lets start with adding architecture defines - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + # Lets start with adding architecture defines + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) + tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_PASCAL60") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) - tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_TURING75") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") + tmp 
:= $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 endif endif @@ -1083,6 +1165,9 @@ ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) + endif endif endif @@ -1090,33 +1175,48 @@ endif # Figure out the architecture flag for ROCm. ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX906") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx906\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx906 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX908") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx908\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx908 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX90A") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx90A\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx90a +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx940\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := 
--offload-arch=gfx940 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942_APU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1030\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1030 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1100") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1100\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1100 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1103") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1103\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1103 endif @@ -1125,8 +1225,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) - 
KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) - KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) + KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE") @@ -1136,6 +1236,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_CXXFLAGS+=-fno-gpu-rdc KOKKOS_LDFLAGS+=-fno-gpu-rdc endif + + ifeq ($(KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC") + else + tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC */") + endif +endif + +ifneq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 0) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + endif + endif endif # Figure out Intel architecture flags. 
@@ -1189,6 +1304,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) KOKKOS_LDFLAGS+=-fsycl KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) + + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) @@ -1210,6 +1327,8 @@ ifeq ($(KOKKOS_INTERNAL_DISABLE_BUNDLED_MDSPAN), 0) endif tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_MDSPAN") +tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY") + KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) @@ -1232,7 +1351,22 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") + ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) + tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call 
kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp") + endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp") @@ -1252,33 +1386,15 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp") - endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENACC.hpp>","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENACC.hpp>","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) - tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include 
<decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp") - endif - ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp") - endif endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/MDSpan/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) @@ -1331,6 +1447,48 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LIBS += $(KOKKOS_INTERNAL_OPENACC_LIB) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + ifneq ($(CUDA_PATH),) + ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1) + CUDA_PATH := $(CUDA_PATH:/compilers=/cuda) + endif + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(CUDA_PATH),) + KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 + endif + KOKKOS_LIBS += -lcudart + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + 
KOKKOS_LIBS += -cuda + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(ROCM_PATH),) + KOKKOS_CPPFLAGS += -I$(ROCM_PATH)/include + KOKKOS_LDFLAGS += -L$(ROCM_PATH)/lib + endif + KOKKOS_LIBS += -lamdhip64 + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=multicore + endif + else + # Automatic fallback mode; try to offload to any available GPU, and fall back + # to the host CPU if no available GPU is found. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=gpu,multicore + endif + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -1386,11 +1544,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) KOKKOS_TPL_LIBRARY_NAMES += hpx endif -# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning. -ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC)) -endif - # With Cygwin functions such as fdopen and fileno are not defined # when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here.
Not sure if that has any bad side effects @@ -1444,6 +1597,16 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") + endif +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") +endif tmp := $(call desul_append_header, "") tmp := $(call desul_append_header, "$H""endif") @@ -1468,6 +1631,12 @@ $(DESUL_CONFIG_HEADER): KOKKOS_CPP_DEPENDS := $(DESUL_CONFIG_HEADER) KokkosCore_config.h $(KOKKOS_HEADERS) +# Tasking is deprecated +ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + TMP_KOKKOS_SRC := $(KOKKOS_SRC) + KOKKOS_SRC = $(patsubst %Task.cpp,, $(TMP_KOKKOS_SRC)) +endif + KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) @@ -1476,7 +1645,7 @@ include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ - KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets index ec8770dd7de048f66333f31b97454fa9f89c3db7..be535eea3e7c4663f89393d77004ae67e54a208d 100644 --- a/packages/kokkos/Makefile.targets +++ b/packages/kokkos/Makefile.targets @@ -16,12 +16,8 @@ Kokkos_HostSpace.o: 
$(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ho $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp -Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp @@ -30,8 +26,6 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp -Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) 
$(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp @@ -42,17 +36,21 @@ Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort. ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +endif Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp endif @@ -77,13 +75,17 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp +Kokkos_HIP_ZeroMemset.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) -Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp +Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp +Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -91,26 +93,26 @@ Kokkos_OpenMP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_Ope $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) 
$(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) -Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp -Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) @@ -123,6 +125,3 @@ Kokkos_OpenACC_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC Kokkos_OpenACC_SharedAllocationRecord.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp $(CXX) 
$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp endif - -Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/packages/kokkos/README.md b/packages/kokkos/README.md index 033346e956e040db030fdb334287bd5a71c5fafa..56159b35c29bbbbabdb5f8d687e2e3bd4c63120c 100644 --- a/packages/kokkos/README.md +++ b/packages/kokkos/README.md @@ -1,4 +1,4 @@ - +[](https://kokkos.org) # Kokkos: Core Libraries @@ -10,48 +10,71 @@ hierarchies and multiple types of execution resources. It currently can use CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other backends in development. -**Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem.** +**Kokkos Core is part of the [Kokkos C++ Performance Portability Programming Ecosystem](https://kokkos.org/about/abstract/).** -For the complete documentation, click below: +Kokkos is a [Linux Foundation](https://linuxfoundation.org) project. -# [kokkos.github.io/kokkos-core-wiki](https://kokkos.github.io/kokkos-core-wiki) - -# Learning about Kokkos +## Learning about Kokkos To start learning about Kokkos: -- [Kokkos Lectures](https://kokkos.github.io/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem capabilities. +- [Kokkos Lectures](https://kokkos.org/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important capabilities. -- [Programming guide](https://kokkos.github.io/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. 
+- [Programming guide](https://kokkos.org/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. -- [API reference](https://kokkos.github.io/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.github.io/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.github.io/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.github.io/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.github.io/kokkos-core-wiki/API/alphabetical.html). +- [API reference](https://kokkos.org/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.org/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.org/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.org/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.org/kokkos-core-wiki/API/alphabetical.html). -- [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. +- [Use cases and Examples](https://kokkos.org/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. +## Obtaining Kokkos -For non-public questions send an email to: *crtrott(at)sandia.gov* +The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). + +The current release is [4.5.01](https://github.com/kokkos/kokkos/releases/tag/4.5.01).
+ +```bash +curl -OJ -L https://github.com/kokkos/kokkos/releases/download/4.5.01/kokkos-4.5.01.tar.gz +# Or with wget +wget https://github.com/kokkos/kokkos/releases/download/4.5.01/kokkos-4.5.01.tar.gz +``` + +To clone the latest development version of Kokkos from GitHub: + +```bash +git clone -b develop https://github.com/kokkos/kokkos.git +``` -# Contributing to Kokkos +### Building Kokkos -Please see [this page](https://kokkos.github.io/kokkos-core-wiki/contributing.html) for details on how to contribute. +To build Kokkos, you will need to have a C++ compiler that supports C++17 or later. +All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.org/kokkos-core-wiki/requirements.html). -# Requirements, Building and Installing +Building and installation instructions are described [here](https://kokkos.org/kokkos-core-wiki/building.html). + +You can also install Kokkos using [Spack](https://spack.io/): `spack install kokkos`. [Available configuration options](https://packages.spack.io/package.html?name=kokkos) can be displayed using `spack info kokkos`. + +## For the complete documentation: [kokkos.org/kokkos-core-wiki/](https://kokkos.org/kokkos-core-wiki/) + +## Support + +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. + +For non-public questions send an email to: *crtrott(at)sandia.gov* -All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.github.io/kokkos-core-wiki/requirements.html). +## Contributing -Building and installation instructions are described [here](https://kokkos.github.io/kokkos-core-wiki/building.html). +Please see [this page](https://kokkos.org/kokkos-core-wiki/contributing.html) for details on how to contribute. -# Citing Kokkos +## Citing Kokkos -Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citation.html). 
+Please see the [following page](https://kokkos.org/kokkos-core-wiki/citation.html). -# License +## License -[](https://opensource.org/licenses/BSD-3-Clause) +[](https://spdx.org/licenses/LLVM-exception.html) Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. -The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or -[here](https://github.com/kokkos/kokkos/blob/master/LICENSE). +The full license statement used in all headers is available [here](https://kokkos.org/kokkos-core-wiki/license.html) or +[here](https://github.com/kokkos/kokkos/blob/develop/LICENSE). diff --git a/packages/kokkos/SECURITY.md b/packages/kokkos/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..93cf6e3663e46d549020fdd708aac3c91a369437 --- /dev/null +++ b/packages/kokkos/SECURITY.md @@ -0,0 +1,12 @@ +# Reporting Security Issues + +To report a security issue, please email +[lebrungrandt@ornl.gov](mailto:lebrungrandt@ornl.gov) +and [crtrott@sandia.gov](mailto:crtrott@sandia.gov) +with a description of the issue, the steps you took to create the issue, +affected versions, and, if known, mitigations for the issue. + +Our vulnerability management team will respond within 5 working days of your +email. If the issue is confirmed as a vulnerability, we will open a +Security Advisory and acknowledge your contributions as part of it. This project +follows a 90 day disclosure timeline. diff --git a/packages/kokkos/Spack.md b/packages/kokkos/Spack.md index 79606c259d5b5e840ff40240de4fc1087bad2c4d..06c763a64ee0dcad76007cd2d581fe915a7532a9 100644 --- a/packages/kokkos/Spack.md +++ b/packages/kokkos/Spack.md @@ -159,7 +159,6 @@ If you don't specify a CUDA build variant in a `packages.yaml` and you build you > spack install superscience ```` you may end up just getting the default Kokkos (i.e. Serial). 
-Some examples are included in the `config/yaml` folder for common platforms. Before running `spack install <package>` we recommend running `spack spec <package>` to confirm your dependency tree is correct. For example, with Kokkos Kernels: ````bash diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt index 368984647e9fbe3b3d9b5aa4dfe01457edbbd52c..73ce9f7ec5526a7f074ae6bfcc0da033965c83fd 100644 --- a/packages/kokkos/algorithms/CMakeLists.txt +++ b/packages/kokkos/algorithms/CMakeLists.txt @@ -1,7 +1,7 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) - KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -ENDIF() +if(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) + kokkos_add_test_directories(unit_tests) +endif() diff --git a/packages/kokkos/algorithms/src/CMakeLists.txt b/packages/kokkos/algorithms/src/CMakeLists.txt index 169577894728a3b75111c3d2398a0c015950fb1f..9f10b85e0214e91af75d11203cb2a8c387ca57b9 100644 --- a/packages/kokkos/algorithms/src/CMakeLists.txt +++ b/packages/kokkos/algorithms/src/CMakeLists.txt @@ -1,34 +1,29 @@ #I have to leave these here for tribits -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -FILE(GLOB ALGO_HEADERS *.hpp) -FILE(GLOB ALGO_SOURCES *.cpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) +file(GLOB ALGO_HEADERS *.hpp) 
+file(GLOB ALGO_SOURCES *.cpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) #----------------------------------------------------------------------------- # We have to pass the sources in here for Tribits # These will get ignored for standalone CMake and a true interface library made -KOKKOS_ADD_INTERFACE_LIBRARY( - kokkosalgorithms - NOINSTALLHEADERS ${ALGO_HEADERS} - SOURCES ${ALGO_SOURCES} +kokkos_add_interface_library(kokkosalgorithms NOINSTALLHEADERS ${ALGO_HEADERS} SOURCES ${ALGO_SOURCES}) +kokkos_lib_include_directories( + kokkosalgorithms ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} -) - - +kokkos_link_tpl(kokkoscontainers PUBLIC ROCTHRUST) +kokkos_link_tpl(kokkoscore PUBLIC ONEDPL) diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp index 2d7d236d2fc2df0967a49768ba14a97d1a63b917..b28ea4c2ca9a686c35b7f81acf4d4b4f30b0d5fc 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp @@ -615,7 +615,7 @@ template <class DeviceType> struct Random_UniqueIndex { using locks_view_type = View<int**, DeviceType>; KOKKOS_FUNCTION - static int get_state_idx(const locks_view_type) { + static int get_state_idx(const locks_view_type&) { KOKKOS_IF_ON_HOST( (return DeviceType::execution_space::impl_hardware_thread_id();)) @@ -665,17 +665,16 @@ struct Random_UniqueIndex< #ifdef KOKKOS_ENABLE_SYCL template <class MemorySpace> -struct Random_UniqueIndex< - Kokkos::Device<Kokkos::Experimental::SYCL, MemorySpace>> { +struct 
Random_UniqueIndex<Kokkos::Device<Kokkos::SYCL, MemorySpace>> { using locks_view_type = - View<int**, Kokkos::Device<Kokkos::Experimental::SYCL, MemorySpace>>; + View<int**, Kokkos::Device<Kokkos::SYCL, MemorySpace>>; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; std::size_t gridDim[3] = { @@ -849,18 +848,17 @@ class Random_XorShift64 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-Muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1094,18 +1092,17 @@ class Random_XorShift1024 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-Muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>; + + const double u =
drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1123,7 +1120,7 @@ class Random_XorShift1024_Pool { using execution_space = typename device_type::execution_space; using locks_type = View<int**, device_type>; using int_view_type = View<int**, device_type>; - using state_data_type = View<uint64_t * [16], device_type>; + using state_data_type = View<uint64_t* [16], device_type>; locks_type locks_ = {}; state_data_type state_ = {}; @@ -1545,13 +1542,23 @@ template <class ViewType, class RandomPool, class IndexType = int64_t> void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin, typename ViewType::const_value_type end) { - fill_random(typename ViewType::execution_space{}, a, g, begin, end); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, begin, end); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } template <class ViewType, class RandomPool, class IndexType = int64_t> void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) { - fill_random(typename ViewType::execution_space{}, a, g, 0, range); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, 0, range); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } } // namespace Kokkos diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp index f77484cc5559dedb93b6b3ab65139c80e12180dc..136b4ec82dcd3d755d8a175195db9c955fd9f187 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -23,6 +23,7 @@ #include 
"sorting/Kokkos_BinSortPublicAPI.hpp" #include "sorting/Kokkos_SortPublicAPI.hpp" +#include "sorting/Kokkos_SortByKeyPublicAPI.hpp" #include "sorting/Kokkos_NestedSortPublicAPI.hpp" #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT diff --git a/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp b/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp index 436ae0d10bf88bcd4643cf0255bb442fe1d1bec4..b532a774e13089dd6a878780102075dad5a2d53a 100644 --- a/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp @@ -35,7 +35,6 @@ // following the std classification. // modifying ops -#include "std_algorithms/Kokkos_Swap.hpp" #include "std_algorithms/Kokkos_IterSwap.hpp" // non-modifying sequence diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp index 73e751f572c5e866f03d93ce54f1b109e50a1ea2..8e7de32a07b2fb2fb92f126005a0e2acd81b9871 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp @@ -35,11 +35,11 @@ struct BinOp1D { #endif // Construct BinOp with number of bins, minimum value and maximum value - BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + BinOp1D(int max_bins, typename KeyViewType::const_value_type min, typename KeyViewType::const_value_type max) - : max_bins_(max_bins__ + 1), + : max_bins_(max_bins + 1), // Cast to double to avoid possible overflow when using integer - mul_(static_cast<double>(max_bins__) / + mul_(static_cast<double>(max_bins) / (static_cast<double>(max) - static_cast<double>(min))), min_(static_cast<double>(min)) { // For integral types the number of bins may be larger than the range @@ -47,7 +47,7 @@ struct BinOp1D { // and then don't need to sort bins. 
if (std::is_integral<typename KeyViewType::const_value_type>::value && (static_cast<double>(max) - static_cast<double>(min)) <= - static_cast<double>(max_bins__)) { + static_cast<double>(max_bins)) { mul_ = 1.; } } @@ -82,16 +82,16 @@ struct BinOp3D { BinOp3D() = delete; #endif - BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + BinOp3D(int max_bins[], typename KeyViewType::const_value_type min[], typename KeyViewType::const_value_type max[]) { - max_bins_[0] = max_bins__[0]; - max_bins_[1] = max_bins__[1]; - max_bins_[2] = max_bins__[2]; - mul_[0] = static_cast<double>(max_bins__[0]) / + max_bins_[0] = max_bins[0]; + max_bins_[1] = max_bins[1]; + max_bins_[2] = max_bins[2]; + mul_[0] = static_cast<double>(max_bins[0]) / (static_cast<double>(max[0]) - static_cast<double>(min[0])); - mul_[1] = static_cast<double>(max_bins__[1]) / + mul_[1] = static_cast<double>(max_bins[1]) / (static_cast<double>(max[1]) - static_cast<double>(min[1])); - mul_[2] = static_cast<double>(max_bins__[2]) / + mul_[2] = static_cast<double>(max_bins[2]) / (static_cast<double>(max[2]) - static_cast<double>(min[2])); min_[0] = static_cast<double>(min[0]); min_[1] = static_cast<double>(min[1]); diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp index c399279fe48ff2b322b19cf2c9cc68d1477f8a7e..f417b6b13b3cb3adf0728a7b56956c35ff3cdb23 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -388,7 +388,8 @@ class BinSort { // reasonable experimentally. 
if (use_std_sort && bin_size > 10) { KOKKOS_IF_ON_HOST( - (std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + (std::sort(sort_order.data() + lower_bound, + sort_order.data() + upper_bound, [this](int p, int q) { return bin_op(keys_rnd, p, q); });)) } else { for (int k = lower_bound + 1; k < upper_bound; ++k) { diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fc73eccad68c64b5218ec5fb821790f8c08568a9 --- /dev/null +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ +#define KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ + +#include "./impl/Kokkos_SortByKeyImpl.hpp" +#include <Kokkos_Core.hpp> +#include <algorithm> + +namespace Kokkos::Experimental { + +// --------------------------------------------------------------- +// basic overloads +// --------------------------------------------------------------- + +template <class ExecutionSpace, class KeysDataType, class... KeysProperties, + class ValuesDataType, class... 
ValuesProperties> +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values) { + // constraints + using KeysType = Kokkos::View<KeysDataType, KeysProperties...>; + using ValuesType = Kokkos::View<ValuesDataType, ValuesProperties...>; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility<ExecutionSpace, + typename KeysType::memory_space>::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility<ExecutionSpace, + typename ValuesType::memory_space>::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_without_comparator(exec, keys, + values); +} + +// --------------------------------------------------------------- +// overloads supporting a custom comparator +// --------------------------------------------------------------- + +template <class ExecutionSpace, class ComparatorType, class KeysDataType, + class... KeysProperties, class ValuesDataType, + class... 
ValuesProperties> +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + const ComparatorType& comparator) { + // constraints + using KeysType = Kokkos::View<KeysDataType, KeysProperties...>; + using ValuesType = Kokkos::View<ValuesDataType, ValuesProperties...>; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility<ExecutionSpace, + typename KeysType::memory_space>::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility<ExecutionSpace, + typename ValuesType::memory_space>::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. 
The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_with_comparator(exec, keys, values, + comparator); +} + +} // namespace Kokkos::Experimental +#endif diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index a763c41e580701aff17417be77c153577e984c80..20026c77e4155874b6d3ddf182ff8d703a4b4195 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -29,7 +29,7 @@ namespace Kokkos { // --------------------------------------------------------------- template <class ExecutionSpace, class DataType, class... Properties> -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View<DataType, Properties...>& view) { // constraints using ViewType = Kokkos::View<DataType, Properties...>; @@ -52,9 +52,14 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v<ExecutionSpace>) { - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last); + exec.fence("Kokkos::sort without comparator use std::sort"); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size()); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last); + } } else { Impl::sort_device_view_without_comparator(exec, view); } @@ -82,7 +87,7 @@ void sort(const Kokkos::View<DataType, Properties...>& view) { // --------------------------------------------------------------- template <class ExecutionSpace, class ComparatorType, class DataType, class... 
Properties> -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View<DataType, Properties...>& view, const ComparatorType& comparator) { // constraints @@ -105,9 +110,14 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v<ExecutionSpace>) { - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last, comparator); + exec.fence("Kokkos::sort with comparator use std::sort"); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size(), comparator); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last, comparator); + } } else { Impl::sort_device_view_with_comparator(exec, view, comparator); } diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp index 50ac82331957f186de8aaa1135ee34bb37ece83e..2fe58272d9201ad187d0962945712c341147947f 100644 --- a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp @@ -18,7 +18,6 @@ #define KOKKOS_NESTED_SORT_IMPL_HPP_ #include <Kokkos_Core.hpp> -#include <std_algorithms/Kokkos_Swap.hpp> namespace Kokkos { namespace Experimental { @@ -99,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void sort_nested_impl( keyView(elem1) = key2; keyView(elem2) = key1; if constexpr (!std::is_same_v<ValueViewType, std::nullptr_t>) { - Kokkos::Experimental::swap(valueView(elem1), valueView(elem2)); + Kokkos::kokkos_swap(valueView(elem1), valueView(elem2)); } } } diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..2a8f761d9b4f1802db0b823b195c5ef5167c22a8 --- /dev/null +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -0,0 +1,422 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ +#define KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ + +#include <Kokkos_Core.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) + +// Workaround for `Instruction 'shfl' without '.sync' is not supported on +// .target sm_70 and higher from PTX ISA version 6.4`. +// Also see https://github.com/NVIDIA/cub/pull/170. +#if !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" + +#if defined(KOKKOS_COMPILER_CLANG) +// Some versions of Clang fail to compile Thrust, failing with errors like +// this: +// <snip>/thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// The exact combination of versions for Clang and Thrust (or CUDA) for this +// failure was not investigated, however even very recent version combination +// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. 
+// +// Defining _CubLog here locally allows us to avoid that code path, however +// disabling some debugging diagnostics +#pragma push_macro("_CubLog") +#ifdef _CubLog +#undef _CubLog +#endif +#define _CubLog +#include <thrust/device_ptr.h> +#include <thrust/sort.h> +#pragma pop_macro("_CubLog") +#else +#include <thrust/device_ptr.h> +#include <thrust/sort.h> +#endif + +#pragma GCC diagnostic pop + +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include <thrust/device_ptr.h> +#include <thrust/sort.h> +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) && \ + (ONEDPL_VERSION_MAJOR > 2022 || \ + (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)) +#define KOKKOS_ONEDPL_HAS_SORT_BY_KEY +#include <oneapi/dpl/execution> +#include <oneapi/dpl/algorithm> +#endif + +namespace Kokkos::Impl { + +template <typename T> +constexpr inline bool is_admissible_to_kokkos_sort_by_key = + ::Kokkos::is_view<T>::value && T::rank() == 1 && + (std::is_same_v<typename T::traits::array_layout, Kokkos::LayoutLeft> || + std::is_same_v<typename T::traits::array_layout, Kokkos::LayoutRight> || + std::is_same_v<typename T::traits::array_layout, Kokkos::LayoutStride>); + +template <class ViewType> +KOKKOS_INLINE_FUNCTION constexpr void +static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType& /* view */) { + static_assert(is_admissible_to_kokkos_sort_by_key<ViewType>, + "Kokkos::sort_by_key only accepts 1D values View with " + "LayoutRight, LayoutLeft or LayoutStride."); +} + +// For the fallback implementation for sort_by_key using Kokkos::sort, we need +// to consider if Kokkos::sort defers to the fallback implementation that copies +// the array to the host and uses std::sort, see +// copy_to_host_run_stdsort_copy_back() in impl/Kokkos_SortImpl.hpp. If +// sort_on_device_v is true, we assume that std::sort doesn't copy data. +// Otherwise, we manually copy all data to the host and provide Kokkos::sort +// with a host execution space. 
+template <class ExecutionSpace, class Layout> +inline constexpr bool sort_on_device_v = false; + +#if defined(KOKKOS_ENABLE_CUDA) +template <class Layout> +inline constexpr bool sort_on_device_v<Kokkos::Cuda, Layout> = true; + +template <class KeysDataType, class... KeysProperties, class ValuesDataType, + class... ValuesProperties, class... MaybeComparator> +void sort_by_key_cudathrust( + const Kokkos::Cuda& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::cuda::par.on(exec.cuda_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward<MaybeComparator>(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template <class Layout> +inline constexpr bool sort_on_device_v<Kokkos::HIP, Layout> = true; + +template <class KeysDataType, class... KeysProperties, class ValuesDataType, + class... ValuesProperties, class... MaybeComparator> +void sort_by_key_rocthrust( + const Kokkos::HIP& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + MaybeComparator&&... 
maybeComparator) { + const auto policy = thrust::hip::par.on(exec.hip_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward<MaybeComparator>(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template <class Layout> +inline constexpr bool sort_on_device_v<Kokkos::SYCL, Layout> = + std::is_same_v<Layout, Kokkos::LayoutLeft> || + std::is_same_v<Layout, Kokkos::LayoutRight>; + +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY +template <class KeysDataType, class... KeysProperties, class ValuesDataType, + class... ValuesProperties, class... MaybeComparator> +void sort_by_key_onedpl( + const Kokkos::SYCL& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + MaybeComparator&&... maybeComparator) { + if (keys.stride(0) != 1 && values.stride(0) != 1) { + Kokkos::abort( + "SYCL sort_by_key only supports rank-1 Views with stride(0) = 1."); + } + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. 
+ auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + const int n = keys.extent(0); + oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data(), + std::forward<MaybeComparator>(maybeComparator)...); +} +#endif +#endif + +template <typename ExecutionSpace, typename PermutationView, typename ViewType> +void applyPermutation(const ExecutionSpace& space, + const PermutationView& permutation, + const ViewType& view) { + static_assert(std::is_integral_v<typename PermutationView::value_type>); + + auto view_copy = Kokkos::create_mirror( + Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, + Kokkos::WithoutInitializing), + view); + Kokkos::deep_copy(space, view_copy, view); + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::permute_" + view.label(), + Kokkos::RangePolicy<ExecutionSpace>(space, 0, view.extent(0)), + KOKKOS_LAMBDA(int i) { view(i) = view_copy(permutation(i)); }); +} + +// FIXME_NVCC: nvcc has trouble compiling lambdas inside a function with +// variadic templates (sort_by_key_via_sort). Switch to using functors instead. +template <typename Permute> +struct IotaFunctor { + Permute _permute; + KOKKOS_FUNCTION void operator()(int i) const { _permute(i) = i; } +}; +template <typename Keys> +struct LessFunctor { + Keys _keys; + KOKKOS_FUNCTION bool operator()(int i, int j) const { + return _keys(i) < _keys(j); + } +}; + +// FIXME_NVCC+MSVC: We can't use a lambda instead of a functor which gave us +// "For this host platform/dialect, an extended lambda cannot be defined inside +// the 'if' or 'else' block of a constexpr if statement" +template <typename Keys, typename Comparator> +struct KeyComparisonFunctor { + Keys m_keys; + Comparator m_comparator; + KOKKOS_FUNCTION bool operator()(int i, int j) const { + return m_comparator(m_keys(i), m_keys(j)); + } +}; + +template <class ExecutionSpace, class KeysDataType, class... KeysProperties, + class ValuesDataType, class... 
ValuesProperties, + class... MaybeComparator> +void sort_by_key_via_sort( + const ExecutionSpace& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + MaybeComparator&&... maybeComparator) { + static_assert(sizeof...(MaybeComparator) <= 1); + + auto const n = keys.size(); + + Kokkos::View<unsigned int*, ExecutionSpace> permute( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "Kokkos::sort_by_key_via_sort::permute"), + n); + + // iota + Kokkos::parallel_for("Kokkos::sort_by_key_via_sort::iota", + Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n), + IotaFunctor<decltype(permute)>{permute}); + + using Layout = + typename Kokkos::View<unsigned int*, ExecutionSpace>::array_layout; + if constexpr (!sort_on_device_v<ExecutionSpace, Layout>) { + auto host_keys = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + keys); + auto host_permute = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + permute); + Kokkos::deep_copy(exec, host_keys, keys); + Kokkos::deep_copy(exec, host_permute, permute); + + exec.fence("Kokkos::Impl::sort_by_key_via_sort: before host sort"); + Kokkos::DefaultHostExecutionSpace host_exec; + + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort(host_exec, host_permute, + LessFunctor<decltype(host_keys)>{host_keys}); + } else { + auto keys_comparator = + std::get<0>(std::tuple<MaybeComparator...>(maybeComparator...)); + Kokkos::sort( + host_exec, host_permute, + KeyComparisonFunctor<decltype(host_keys), decltype(keys_comparator)>{ + host_keys, keys_comparator}); + } + host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort"); + Kokkos::deep_copy(exec, permute, host_permute); + } else { +#ifdef KOKKOS_ENABLE_SYCL + auto* raw_keys_in_comparator = keys.data(); + auto stride = keys.stride(0); + if constexpr (sizeof...(MaybeComparator) == 0) { + 
Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return raw_keys_in_comparator[i * stride] < + raw_keys_in_comparator[j * stride]; + }); + } else { + auto keys_comparator = + std::get<0>(std::tuple<MaybeComparator...>(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(raw_keys_in_comparator[i * stride], + raw_keys_in_comparator[j * stride]); + }); + } +#else + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort(exec, permute, LessFunctor<decltype(keys)>{keys}); + } else { + auto keys_comparator = + std::get<0>(std::tuple<MaybeComparator...>(maybeComparator...)); + Kokkos::sort( + exec, permute, + KeyComparisonFunctor<decltype(keys), decltype(keys_comparator)>{ + keys, keys_comparator}); + } +#endif + } + + applyPermutation(exec, permute, keys); + applyPermutation(exec, permute, values); +} + +// ------------------------------------------------------ +// +// specialize cases for sorting by key without comparator +// +// ------------------------------------------------------ + +#if defined(KOKKOS_ENABLE_CUDA) +template <class KeysDataType, class... KeysProperties, class ValuesDataType, + class... ValuesProperties> +void sort_by_key_device_view_without_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values) { + sort_by_key_cudathrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template <class KeysDataType, class... KeysProperties, class ValuesDataType, + class... ValuesProperties> +void sort_by_key_device_view_without_comparator( + const Kokkos::HIP& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values) { + sort_by_key_rocthrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template <class KeysDataType, class... 
KeysProperties, class ValuesDataType, + class... ValuesProperties> +void sort_by_key_device_view_without_comparator( + const Kokkos::SYCL& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values); + else +#endif + sort_by_key_via_sort(exec, keys, values); +} +#endif + +// fallback case +template <class ExecutionSpace, class KeysDataType, class... KeysProperties, + class ValuesDataType, class... ValuesProperties> +std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> +sort_by_key_device_view_without_comparator( + const ExecutionSpace& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values) { + sort_by_key_via_sort(exec, keys, values); +} + +// --------------------------------------------------- +// +// specialize cases for sorting by key with comparator +// +// --------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +template <class ComparatorType, class KeysDataType, class... KeysProperties, + class ValuesDataType, class... ValuesProperties> +void sort_by_key_device_view_with_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + const ComparatorType& comparator) { + sort_by_key_cudathrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template <class ComparatorType, class KeysDataType, class... KeysProperties, + class ValuesDataType, class... 
ValuesProperties> +void sort_by_key_device_view_with_comparator( + const Kokkos::HIP& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + const ComparatorType& comparator) { + sort_by_key_rocthrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template <class ComparatorType, class KeysDataType, class... KeysProperties, + class ValuesDataType, class... ValuesProperties> +void sort_by_key_device_view_with_comparator( + const Kokkos::SYCL& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + const ComparatorType& comparator) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values, comparator); + else +#endif + sort_by_key_via_sort(exec, keys, values, comparator); +} +#endif + +// fallback case +template <class ComparatorType, class ExecutionSpace, class KeysDataType, + class... KeysProperties, class ValuesDataType, + class... 
ValuesProperties> +std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> +sort_by_key_device_view_with_comparator( + const ExecutionSpace& exec, + const Kokkos::View<KeysDataType, KeysProperties...>& keys, + const Kokkos::View<ValuesDataType, ValuesProperties...>& values, + const ComparatorType& comparator) { + sort_by_key_via_sort(exec, keys, values, comparator); +} + +#undef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + +} // namespace Kokkos::Impl +#endif diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index d87ab09e7724b87ceaa1cace387e57952bf5ddd6..734ce450f69e1f347776c42b4221056bce10ff82 100644 --- a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -34,6 +34,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -63,6 +64,11 @@ #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include <thrust/device_ptr.h> +#include <thrust/sort.h> +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) #include <oneapi/dpl/execution> #include <oneapi/dpl/algorithm> @@ -141,7 +147,7 @@ void sort_via_binsort(const ExecutionSpace& exec, bool sort_in_bins = true; // TODO: figure out better max_bins then this ... 
int64_t max_bins = view.extent(0) / 2; - if (std::is_integral<typename ViewType::non_const_value_type>::value) { + if (std::is_integral_v<typename ViewType::non_const_value_type>) { // Cast to double to avoid possible overflow when using integer auto const max_val = static_cast<double>(result.max_val); auto const min_val = static_cast<double>(result.min_val); @@ -152,7 +158,7 @@ void sort_via_binsort(const ExecutionSpace& exec, sort_in_bins = false; } } - if (std::is_floating_point<typename ViewType::non_const_value_type>::value) { + if (std::is_floating_point_v<typename ViewType::non_const_value_type>) { KOKKOS_ASSERT(std::isfinite(static_cast<double>(result.max_val) - static_cast<double>(result.min_val))); } @@ -184,13 +190,33 @@ void sort_cudathrust(const Cuda& space, } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template <class DataType, class... Properties, class... MaybeComparator> +void sort_rocthrust(const HIP& space, + const Kokkos::View<DataType, Properties...>& view, + MaybeComparator&&... maybeComparator) { + using ViewType = Kokkos::View<DataType, Properties...>; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + const auto exec = thrust::hip::par.on(space.hip_stream()); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + thrust::sort(exec, first, last, + std::forward<MaybeComparator>(maybeComparator)...); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template <class DataType, class... Properties, class... MaybeComparator> -void sort_onedpl(const Kokkos::Experimental::SYCL& space, +void sort_onedpl(const Kokkos::SYCL& space, const Kokkos::View<DataType, Properties...>& view, MaybeComparator&&... 
maybeComparator) { using ViewType = Kokkos::View<DataType, Properties...>; - static_assert(SpaceAccessibility<Kokkos::Experimental::SYCL, + static_assert(SpaceAccessibility<Kokkos::SYCL, typename ViewType::memory_space>::accessible, "SYCL execution space is not able to access the memory space " "of the View argument!"); @@ -243,19 +269,29 @@ void copy_to_host_run_stdsort_copy_back( KE::copy(exec, view, view_dc); // run sort on the mirror of view_dc - auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); - auto first = KE::begin(mv_h); - auto last = KE::end(mv_h); - std::sort(first, last, std::forward<MaybeComparator>(maybeComparator)...); + auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); + if (view.span_is_contiguous()) { + std::sort(mv_h.data(), mv_h.data() + mv_h.size(), + std::forward<MaybeComparator>(maybeComparator)...); + } else { + auto first = KE::begin(mv_h); + auto last = KE::end(mv_h); + std::sort(first, last, std::forward<MaybeComparator>(maybeComparator)...); + } Kokkos::deep_copy(exec, view_dc, mv_h); // copy back to argument view KE::copy(exec, KE::cbegin(view_dc), KE::cend(view_dc), KE::begin(view)); } else { auto view_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view); - auto first = KE::begin(view_h); - auto last = KE::end(view_h); - std::sort(first, last, std::forward<MaybeComparator>(maybeComparator)...); + if (view.span_is_contiguous()) { + std::sort(view_h.data(), view_h.data() + view_h.size(), + std::forward<MaybeComparator>(maybeComparator)...); + } else { + auto first = KE::begin(view_h); + auto last = KE::end(view_h); + std::sort(first, last, std::forward<MaybeComparator>(maybeComparator)...); + } Kokkos::deep_copy(exec, view, view_h); } } @@ -274,10 +310,18 @@ void sort_device_view_without_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template <class DataType, class... 
Properties> +void sort_device_view_without_comparator( + const HIP& exec, const Kokkos::View<DataType, Properties...>& view) { + sort_rocthrust(exec, view); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template <class DataType, class... Properties> void sort_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View<DataType, Properties...>& view) { using ViewType = Kokkos::View<DataType, Properties...>; static_assert( @@ -320,11 +364,19 @@ void sort_device_view_with_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template <class ComparatorType, class DataType, class... Properties> +void sort_device_view_with_comparator( + const HIP& exec, const Kokkos::View<DataType, Properties...>& view, + const ComparatorType& comparator) { + sort_rocthrust(exec, view, comparator); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template <class ComparatorType, class DataType, class... Properties> void sort_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, - const Kokkos::View<DataType, Properties...>& view, + const Kokkos::SYCL& exec, const Kokkos::View<DataType, Properties...>& view, const ComparatorType& comparator) { using ViewType = Kokkos::View<DataType, Properties...>; static_assert( @@ -355,11 +407,16 @@ sort_device_view_with_comparator( // and then copies data back. Potentially, this can later be changed // with a better solution like our own quicksort on device or similar. +// Note with HIP unified memory this code path is still the right thing to do +// if we end up here when RocThrust is not enabled. +// The create_mirror_view_and_copy will do the right thing (no copy). 
+#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY using ViewType = Kokkos::View<DataType, Properties...>; using MemSpace = typename ViewType::memory_space; static_assert(!SpaceAccessibility<HostSpace, MemSpace>::accessible, "Impl::sort_device_view_with_comparator: should not be called " "on a view that is already accessible on the host"); +#endif copy_to_host_run_stdsort_copy_back(exec, view, comparator); } diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp index b7ce1ba5edb335d2b66ec4d2f5abb0f2fb4ef552..c5406c72b0d84701f1a6cc7843838262fdab0914 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto copy(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template <typename TeamHandleType, typename DataType1, typename... 
Properties1, KOKKOS_FUNCTION auto copy( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp index 8f9e0f19b80837ac12efa84f6ebbad75fd08871c..82071a9362eb4d7c7a6a4acce4488b8eae9e8ecd 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto copy_backward(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto copy_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template <typename TeamHandleType, typename DataType1, typename... 
Properties1, KOKKOS_FUNCTION auto copy_backward( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp index ba18bc76b93682131e082e8dd90fad97607b14c6..599fde5737ae886bee1a36a324f538418d0639f5 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp @@ -54,7 +54,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto copy_if(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) { + const ::Kokkos::View<DataType2, Properties2...>& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -69,7 +70,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) { + const ::Kokkos::View<DataType2, Properties2...>& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -96,7 +98,7 @@ template <typename TeamHandleType, typename DataType1, typename... 
Properties1, KOKKOS_FUNCTION auto copy_if( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) { + const ::Kokkos::View<DataType2, Properties2...>& dest, Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp index 43c91204837e6e695229ea90173e868a5d125d69..637d8d4cbc5180b57b918a70e594e1000f3891d1 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp @@ -51,7 +51,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto copy_n(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, Size count, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto copy_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, Size count, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template <typename TeamHandleType, typename DataType1, typename... 
Properties1, KOKKOS_FUNCTION auto copy_n( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, Size count, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp index a72a49cc22b827c24f9c2f3e92c6507b2f1c0508..593c42f87e13443912c34d649bfe3cba0b080f6e 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp @@ -80,7 +80,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2) { + const ::Kokkos::View<DataType2, Properties2...>& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -96,7 +96,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2) { + const ::Kokkos::View<DataType2, Properties2...>& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -111,7 +111,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2, + const ::Kokkos::View<DataType2, Properties2...>& view2, 
BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -128,7 +128,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2, + const ::Kokkos::View<DataType2, Properties2...>& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -227,7 +227,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1, KOKKOS_FUNCTION bool equal( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2) { + const ::Kokkos::View<DataType2, Properties2...>& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -243,7 +243,7 @@ template <typename TeamHandleType, typename DataType1, typename... 
Properties1, KOKKOS_FUNCTION bool equal( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2, + const ::Kokkos::View<DataType2, Properties2...>& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp index 6215b325afc79c92dcd0355cd05ca0856943846f..05969be463a56173dcb1185ff85314bcedddc44d 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ForEach.hpp @@ -29,49 +29,46 @@ namespace Experimental { template < class ExecutionSpace, class IteratorType, class UnaryFunctorType, std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { - return Impl::for_each_exespace_impl(label, ex, first, last, - std::move(functor)); +void for_each(const std::string& label, const ExecutionSpace& ex, + IteratorType first, IteratorType last, UnaryFunctorType functor) { + Impl::for_each_exespace_impl(label, ex, first, last, std::move(functor)); } template < class ExecutionSpace, class IteratorType, class UnaryFunctorType, std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> -UnaryFunctorType for_each(const ExecutionSpace& ex, IteratorType first, - IteratorType last, UnaryFunctorType functor) { - return Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", - ex, first, last, std::move(functor)); +void for_each(const ExecutionSpace& ex, IteratorType first, IteratorType last, + UnaryFunctorType functor) { + 
Impl::for_each_exespace_impl("Kokkos::for_each_iterator_api_default", ex, + first, last, std::move(functor)); } template < class ExecutionSpace, class DataType, class... Properties, class UnaryFunctorType, std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> -UnaryFunctorType for_each(const std::string& label, const ExecutionSpace& ex, - const ::Kokkos::View<DataType, Properties...>& v, - UnaryFunctorType functor) { +void for_each(const std::string& label, const ExecutionSpace& ex, + const ::Kokkos::View<DataType, Properties...>& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl(label, ex, KE::begin(v), KE::end(v), + std::move(functor)); } template < class ExecutionSpace, class DataType, class... Properties, class UnaryFunctorType, std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> -UnaryFunctorType for_each(const ExecutionSpace& ex, - const ::Kokkos::View<DataType, Properties...>& v, - UnaryFunctorType functor) { +void for_each(const ExecutionSpace& ex, + const ::Kokkos::View<DataType, Properties...>& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, - KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_exespace_impl("Kokkos::for_each_view_api_default", ex, + KE::begin(v), KE::end(v), std::move(functor)); } // @@ -82,24 +79,23 @@ UnaryFunctorType for_each(const ExecutionSpace& ex, template <class TeamHandleType, class IteratorType, class UnaryFunctorType, std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0> -KOKKOS_FUNCTION UnaryFunctorType for_each(const TeamHandleType& teamHandle, - IteratorType first, IteratorType last, 
- UnaryFunctorType functor) { - return Impl::for_each_team_impl(teamHandle, first, last, std::move(functor)); +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { + Impl::for_each_team_impl(teamHandle, first, last, std::move(functor)); } template <class TeamHandleType, class DataType, class... Properties, class UnaryFunctorType, std::enable_if_t<Kokkos::is_team_handle_v<TeamHandleType>, int> = 0> -KOKKOS_FUNCTION UnaryFunctorType -for_each(const TeamHandleType& teamHandle, - const ::Kokkos::View<DataType, Properties...>& v, - UnaryFunctorType functor) { +KOKKOS_FUNCTION void for_each(const TeamHandleType& teamHandle, + const ::Kokkos::View<DataType, Properties...>& v, + UnaryFunctorType functor) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); namespace KE = ::Kokkos::Experimental; - return Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), - std::move(functor)); + Impl::for_each_team_impl(teamHandle, KE::begin(v), KE::end(v), + std::move(functor)); } } // namespace Experimental diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp index a796a306dda0af270977550a4095cd63be5e86b4..5bb2d1039dc660089c46f5891a8473aafc57936d 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp @@ -19,7 +19,6 @@ #include <Kokkos_Core.hpp> #include "impl/Kokkos_Constraints.hpp" -#include "Kokkos_Swap.hpp" namespace Kokkos { namespace Experimental { @@ -33,7 +32,7 @@ struct StdIterSwapFunctor { KOKKOS_FUNCTION void operator()(int i) const { (void)i; - ::Kokkos::Experimental::swap(*m_a, *m_b); + ::Kokkos::kokkos_swap(*m_a, *m_b); } KOKKOS_FUNCTION @@ -58,6 +57,16 @@ void iter_swap(IteratorType1 a, IteratorType2 b) { Impl::iter_swap_impl(a, b); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template 
<class T> +KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!") +KOKKOS_FUNCTION + void swap(T& a, T& b) noexcept(::Kokkos::kokkos_swap(std::declval<T&>(), + std::declval<T&>())) { + ::Kokkos::kokkos_swap(a, b); +} +#endif + } // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp index 4b5c69df4512e5f514b5b72aaaf1da431cfa77b4..e13479c370b82b8df271f950c2295deb909291f8 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp @@ -54,7 +54,7 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2) { + const ::Kokkos::View<DataType2, Properties2...>& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -71,7 +71,7 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2) { + const ::Kokkos::View<DataType2, Properties2...>& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -112,7 +112,8 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) { + const ::Kokkos::View<DataType2, Properties2...>& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -129,7 +130,8 @@ template 
< bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) { + const ::Kokkos::View<DataType2, Properties2...>& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -161,7 +163,7 @@ template <class TeamHandleType, class DataType1, class... Properties1, KOKKOS_FUNCTION bool lexicographical_compare( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2) { + const ::Kokkos::View<DataType2, Properties2...>& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -187,7 +189,8 @@ template <class TeamHandleType, class DataType1, class... Properties1, KOKKOS_FUNCTION bool lexicographical_compare( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& view1, - ::Kokkos::View<DataType2, Properties2...>& view2, ComparatorType comp) { + const ::Kokkos::View<DataType2, Properties2...>& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp index f04ea12ba88a92c6f4bab3412e4733c9a210133c..ac308ea1845cb565b0f62cfbf81d77f6f5090755 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto move(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - 
::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto move(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1, KOKKOS_FUNCTION auto move( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp index 375474ca57f956504fea1f10a08c3dbfd0a47615..2789ab21796837258fb848aa18090f27925215b2 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp @@ -41,7 +41,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto move_backward(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto move_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template <typename TeamHandleType, typename DataType1, typename... Properties1, KOKKOS_FUNCTION auto move_backward( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp index b84f00f8bb500f23e53326ffb1b460ead9eb8276..ea7e55ca619068b0e1a41d9f9dd6b0cb8203531a 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp @@ -91,7 +91,7 @@ template <typename ExecutionSpace, typename IteratorType, typename ValueType, int> = 0> ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl( @@ -105,7 +105,7 @@ template <typename ExecutionSpace, typename IteratorType, typename ValueType, ValueType reduce(const std::string& label, const ExecutionSpace& ex, 
IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl(label, ex, first, last, @@ -119,7 +119,7 @@ template <typename ExecutionSpace, typename DataType, typename... Properties, ValueType reduce(const ExecutionSpace& ex, const ::Kokkos::View<DataType, Properties...>& view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -137,7 +137,7 @@ template <typename ExecutionSpace, typename DataType, typename... Properties, ValueType reduce(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType, Properties...>& view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -157,7 +157,7 @@ template <typename ExecutionSpace, typename IteratorType, typename ValueType, ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -172,7 +172,7 @@ template <typename ExecutionSpace, typename IteratorType, typename ValueType, ValueType reduce(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible<ValueType>::value, + 
static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -186,7 +186,7 @@ template <typename ExecutionSpace, typename DataType, typename... Properties, ValueType reduce(const ExecutionSpace& ex, const ::Kokkos::View<DataType, Properties...>& view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -204,7 +204,7 @@ template <typename ExecutionSpace, typename DataType, typename... Properties, ValueType reduce(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType, Properties...>& view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -258,7 +258,7 @@ template < KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::reduce_default_functors_team_impl(teamHandle, first, last, @@ -273,7 +273,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View<DataType, Properties...>& view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -294,7 +294,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value, 
BinaryOp joiner) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::reduce_custom_functors_team_impl(teamHandle, first, last, @@ -309,7 +309,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View<DataType, Properties...>& view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp index 37336c983ab0e53d0a2172b5a7605e0edd75ea39..66f39c4eaa6094aa51d522937ac7010ba53b1976 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto reverse_copy(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v<ExecutionSpace>, int> = 0> auto reverse_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template 
<typename TeamHandleType, typename DataType1, typename... Properties1, KOKKOS_FUNCTION auto reverse_copy( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp index 39f33b64879a2509edf9d7ee111755df3b6f31a6..d66763d304c42567320653a9d21a8ab3e19fc872 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp @@ -40,7 +40,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1, std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0> auto swap_ranges(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1, std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0> auto swap_ranges(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template <typename TeamHandleType, typename DataType1, typename... 
Properties1, KOKKOS_FUNCTION auto swap_ranges( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest) { + const ::Kokkos::View<DataType2, Properties2...>& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp index 838c9169e25c4411c22259b8ef5a21b5f964a4fe..84cbed524d371c1c523b591d5d5eeb9349e596e2 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp @@ -58,7 +58,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1, std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0> auto transform(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest, + const ::Kokkos::View<DataType2, Properties2...>& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -73,7 +73,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1, std::enable_if_t<is_execution_space_v<ExecutionSpace>, int> = 0> auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest, + const ::Kokkos::View<DataType2, Properties2...>& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -119,7 +119,7 @@ template <typename ExecutionSpace, typename DataType1, typename... 
Properties1, auto transform(const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source1, const ::Kokkos::View<DataType2, Properties2...>& source2, - ::Kokkos::View<DataType3, Properties3...>& dest, + const ::Kokkos::View<DataType3, Properties3...>& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -137,7 +137,7 @@ template <typename ExecutionSpace, typename DataType1, typename... Properties1, auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View<DataType1, Properties1...>& source1, const ::Kokkos::View<DataType2, Properties2...>& source2, - ::Kokkos::View<DataType3, Properties3...>& dest, + const ::Kokkos::View<DataType3, Properties3...>& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -174,7 +174,8 @@ template <typename TeamHandleType, typename DataType1, typename... 
Properties1, KOKKOS_FUNCTION auto transform( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source, - ::Kokkos::View<DataType2, Properties2...>& dest, UnaryOperation unary_op) { + const ::Kokkos::View<DataType2, Properties2...>& dest, + UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -207,7 +208,7 @@ KOKKOS_FUNCTION auto transform( const TeamHandleType& teamHandle, const ::Kokkos::View<DataType1, Properties1...>& source1, const ::Kokkos::View<DataType2, Properties2...>& source2, - ::Kokkos::View<DataType3, Properties3...>& dest, + const ::Kokkos::View<DataType3, Properties3...>& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp index 101f5113f68a406a2aa59c0542ff5f2cfe24f9dc..89585ddbea0c14b3ae59cf401ce183a9fcf42c30 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp @@ -117,7 +117,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -136,7 +136,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - 
static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -157,7 +157,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -182,7 +182,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -208,7 +208,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType first1, IteratorType last1, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -228,7 +228,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -248,7 +248,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { 
namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -270,7 +270,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -345,7 +345,7 @@ KOKKOS_FUNCTION ValueType transform_reduce( const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -366,7 +366,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -393,7 +393,7 @@ KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -412,7 
+412,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible<ValueType>::value, + static_assert(std::is_move_constructible_v<ValueType>, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp index a8171fa068d1290f8842956f888165e087894cad..9f7fcf94fe0b8391413cb8ff892c9af900794d63 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_AdjacentDifference.hpp @@ -82,6 +82,11 @@ OutputIteratorType adjacent_difference_exespace_impl( return first_dest; } +#ifdef KOKKOS_ENABLE_DEBUG + // check for overlapping iterators + Impl::expect_no_overlap(first_from, last_from, first_dest); +#endif + // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); @@ -114,6 +119,11 @@ KOKKOS_FUNCTION OutputIteratorType adjacent_difference_team_impl( return first_dest; } +#ifdef KOKKOS_ENABLE_DEBUG + // check for overlapping iterators + Impl::expect_no_overlap(first_from, last_from, first_dest); +#endif + // run const auto num_elements = Kokkos::Experimental::distance(first_from, last_from); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 27ce5a6fad6eb89124b2dc13462102852171cf8c..da16141f5a7f7762b569b79c87e0d727e6c0bf77 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -24,18 +24,21 @@ namespace Kokkos { namespace Experimental { 
namespace Impl { +template <class T> +class RandomAccessIterator; + template <typename T, typename enable = void> struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template <typename T> struct is_admissible_to_kokkos_std_algorithms< - T, std::enable_if_t< ::Kokkos::is_view<T>::value && T::rank() == 1 && - (std::is_same<typename T::traits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename T::traits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename T::traits::array_layout, - Kokkos::LayoutStride>::value)> > + T, std::enable_if_t<::Kokkos::is_view<T>::value && T::rank() == 1 && + (std::is_same_v<typename T::traits::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename T::traits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename T::traits::array_layout, + Kokkos::LayoutStride>)>> : std::true_type {}; template <class ViewType> @@ -58,6 +61,18 @@ using is_iterator = Kokkos::is_detected<iterator_category_t, T>; template <class T> inline constexpr bool is_iterator_v = is_iterator<T>::value; +template <typename ViewType> +struct is_kokkos_iterator : std::false_type {}; + +template <typename ViewType> +struct is_kokkos_iterator<RandomAccessIterator<ViewType>> { + static constexpr bool value = + is_admissible_to_kokkos_std_algorithms<ViewType>::value; +}; + +template <class T> +inline constexpr bool is_kokkos_iterator_v = is_kokkos_iterator<T>::value; + // // are_iterators // @@ -87,8 +102,8 @@ struct are_random_access_iterators; template <class T> struct are_random_access_iterators<T> { static constexpr bool value = - is_iterator_v<T> && std::is_base_of<std::random_access_iterator_tag, - typename T::iterator_category>::value; + is_iterator_v<T> && std::is_base_of_v<std::random_access_iterator_tag, + typename T::iterator_category>; }; template <class Head, class... 
Tail> @@ -150,9 +165,8 @@ struct iterators_have_matching_difference_type<T> { template <class T1, class T2> struct iterators_have_matching_difference_type<T1, T2> { - static constexpr bool value = - std::is_same<typename T1::difference_type, - typename T2::difference_type>::value; + static constexpr bool value = std::is_same_v<typename T1::difference_type, + typename T2::difference_type>; }; template <class T1, class T2, class... Tail> @@ -215,6 +229,38 @@ KOKKOS_INLINE_FUNCTION void expect_valid_range(IteratorType first, (void)last; } +// +// Check if kokkos iterators are overlapping +// +template <typename IteratorType1, typename IteratorType2> +KOKKOS_INLINE_FUNCTION void expect_no_overlap( + [[maybe_unused]] IteratorType1 first, [[maybe_unused]] IteratorType1 last, + [[maybe_unused]] IteratorType2 s_first) { + if constexpr (is_kokkos_iterator_v<IteratorType1> && + is_kokkos_iterator_v<IteratorType2>) { + auto const view1 = first.view(); + auto const view2 = s_first.view(); + + std::size_t stride1 = view1.stride(0); + std::size_t stride2 = view2.stride(0); + ptrdiff_t first_diff = view1.data() - view2.data(); + + // FIXME If strides are not identical, checks may not be made + // with the cost of O(1) + // Currently, checks are made only if strides are identical + // If first_diff == 0, there is already an overlap + if (stride1 == stride2 || first_diff == 0) { + [[maybe_unused]] bool is_no_overlap = (first_diff % stride1); + auto* first_pointer1 = view1.data(); + auto* first_pointer2 = view2.data(); + [[maybe_unused]] auto* last_pointer1 = first_pointer1 + (last - first); + [[maybe_unused]] auto* last_pointer2 = first_pointer2 + (last - first); + KOKKOS_EXPECTS(first_pointer1 >= last_pointer2 || + last_pointer1 <= first_pointer2 || is_no_overlap); + } + } +} + } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp 
b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp index 3c1e2474bc9ce7dd82c1030e6f771bf2e710c174..ad7b8bb8cab6a95506ad88b9a42ad0a54077f60d 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp @@ -150,8 +150,9 @@ KOKKOS_FUNCTION OutputIterator copy_if_team_impl( return d_first + count; } -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) __builtin_unreachable(); #endif } diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp index d3be3b7f6670384c2ddc2042515fd69091bf1db3..99cc4a1cf3a6911bc742d1fcb2e8b621c027a6b0 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ForEachForEachN.hpp @@ -42,10 +42,9 @@ struct StdForEachFunctor { }; template <class HandleType, class IteratorType, class UnaryFunctorType> -UnaryFunctorType for_each_exespace_impl(const std::string& label, - const HandleType& handle, - IteratorType first, IteratorType last, - UnaryFunctorType functor) { +void for_each_exespace_impl(const std::string& label, const HandleType& handle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks Impl::static_assert_random_access_and_accessible(handle, first); Impl::expect_valid_range(first, last); @@ -56,8 +55,6 @@ UnaryFunctorType for_each_exespace_impl(const std::string& label, label, RangePolicy<HandleType>(handle, 0, num_elements), StdForEachFunctor<IteratorType, UnaryFunctorType>(first, functor)); handle.fence("Kokkos::for_each: fence after operation"); - - return functor; } template <class 
ExecutionSpace, class IteratorType, class SizeType, @@ -75,7 +72,7 @@ IteratorType for_each_n_exespace_impl(const std::string& label, } for_each_exespace_impl(label, ex, first, last, std::move(functor)); - // no neeed to fence since for_each_exespace_impl fences already + // no need to fence since for_each_exespace_impl fences already return last; } @@ -84,9 +81,9 @@ IteratorType for_each_n_exespace_impl(const std::string& label, // team impl // template <class TeamHandleType, class IteratorType, class UnaryFunctorType> -KOKKOS_FUNCTION UnaryFunctorType -for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first, - IteratorType last, UnaryFunctorType functor) { +KOKKOS_FUNCTION void for_each_team_impl(const TeamHandleType& teamHandle, + IteratorType first, IteratorType last, + UnaryFunctorType functor) { // checks Impl::static_assert_random_access_and_accessible(teamHandle, first); Impl::expect_valid_range(first, last); @@ -96,7 +93,6 @@ for_each_team_impl(const TeamHandleType& teamHandle, IteratorType first, TeamThreadRange(teamHandle, 0, num_elements), StdForEachFunctor<IteratorType, UnaryFunctorType>(first, functor)); teamHandle.team_barrier(); - return functor; } template <class TeamHandleType, class IteratorType, class SizeType, @@ -113,7 +109,7 @@ for_each_n_team_impl(const TeamHandleType& teamHandle, IteratorType first, } for_each_team_impl(teamHandle, first, last, std::move(functor)); - // no neeed to fence since for_each_team_impl fences already + // no need to fence since for_each_team_impl fences already return last; } diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp index 8151ee34955295b1c658b974b829d5bbb78440d5..5a7fe16984a2b336b25cef45e7898db461f97cde 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp +++ 
b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp @@ -47,8 +47,9 @@ struct ExclusiveScanDefaultFunctorForKnownNeutralElement { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = m_first_from[i]; if (final_pass) m_first_dest[i] = update + m_init_value; - update += m_first_from[i]; + update += tmp; } }; @@ -73,6 +74,7 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_first_from[i], false}; if (final_pass) { if (i == 0) { m_first_dest[i] = m_init_value; @@ -81,7 +83,6 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { } } - const auto tmp = value_type{m_first_from[i], false}; this->join(update, tmp); } @@ -132,6 +133,7 @@ struct TransformExclusiveScanFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -142,7 +144,6 @@ struct TransformExclusiveScanFunctorWithValueWrapper { } } - const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; this->join(update, tmp); } @@ -190,6 +191,7 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -200,7 +202,6 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { } } - const auto tmp = ValueType{m_unary_op(m_first_from[i])}; this->join(update, tmp); } diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp 
b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp index 9075562d460e635ce9c6f2214ce8846ba1847baa..dc910861d507773d27c48a56c9a4aeca82dd20b7 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp @@ -30,7 +30,7 @@ namespace Impl { template <class IteratorType1, class IteratorType2> struct StdMoveBackwardFunctor { using index_type = typename IteratorType1::difference_type; - static_assert(std::is_signed<index_type>::value, + static_assert(std::is_signed_v<index_type>, "Kokkos: StdMoveBackwardFunctor requires signed index type"); IteratorType1 m_last; diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5c9854b87d7044c1ed252ff09438fb91be33168c..e8c638c94c75478de3c0e9069507e9a789ae5b10 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -36,18 +36,18 @@ class RandomAccessIterator< ::Kokkos::View<DataType, Args...> > { using iterator_type = RandomAccessIterator<view_type>; using iterator_category = std::random_access_iterator_tag; - using value_type = typename view_type::value_type; + using value_type = typename view_type::non_const_value_type; using difference_type = ptrdiff_t; using pointer = typename view_type::pointer_type; using reference = typename view_type::reference_type; static_assert(view_type::rank == 1 && - (std::is_same<typename view_type::traits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename view_type::traits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename view_type::traits::array_layout, - Kokkos::LayoutStride>::value), + (std::is_same_v<typename view_type::traits::array_layout, + Kokkos::LayoutLeft> || + 
std::is_same_v<typename view_type::traits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename view_type::traits::array_layout, + Kokkos::LayoutStride>), "RandomAccessIterator only supports 1D Views with LayoutLeft, " "LayoutRight, LayoutStride."); @@ -59,6 +59,30 @@ class RandomAccessIterator< ::Kokkos::View<DataType, Args...> > { ptrdiff_t current_index) : m_view(view), m_current_index(current_index) {} +#ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond + template <class OtherViewType> + requires(std::is_constructible_v<view_type, OtherViewType>) + KOKKOS_FUNCTION explicit(!std::is_convertible_v<OtherViewType, view_type>) + RandomAccessIterator(const RandomAccessIterator<OtherViewType>& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#else + template < + class OtherViewType, + std::enable_if_t<std::is_constructible_v<view_type, OtherViewType> && + !std::is_convertible_v<OtherViewType, view_type>, + int> = 0> + KOKKOS_FUNCTION explicit RandomAccessIterator( + const RandomAccessIterator<OtherViewType>& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} + + template <class OtherViewType, + std::enable_if_t<std::is_convertible_v<OtherViewType, view_type>, + int> = 0> + KOKKOS_FUNCTION RandomAccessIterator( + const RandomAccessIterator<OtherViewType>& other) + : m_view(other.m_view), m_current_index(other.m_current_index) {} +#endif + KOKKOS_FUNCTION iterator_type& operator++() { ++m_current_index; @@ -152,9 +176,16 @@ class RandomAccessIterator< ::Kokkos::View<DataType, Args...> > { KOKKOS_FUNCTION reference operator*() const { return m_view(m_current_index); } + KOKKOS_FUNCTION + view_type view() const { return m_view; } + private: view_type m_view; ptrdiff_t m_current_index = 0; + + // Needed for the converting constructor accepting another iterator + template <class> + friend class RandomAccessIterator; }; } // namespace Impl diff --git 
a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp index 50224c8874ed19ebe7041d6f0be8a15e2c5001cb..456df43aed2124ed00f0297dc2b55c4ce8006034 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp @@ -46,15 +46,14 @@ struct StdRemoveIfStage1Functor { void operator()(const IndexType i, IndexType& update, const bool final_pass) const { auto& myval = m_first_from[i]; - if (final_pass) { - if (!m_must_remove(myval)) { + + if (!m_must_remove(myval)) { + if (final_pass) { // calling move here is ok because we are inside final pass // we are calling move assign as specified by the std m_first_dest[update] = std::move(myval); } - } - if (!m_must_remove(myval)) { update += 1; } } @@ -108,7 +107,9 @@ IteratorType remove_if_exespace_impl(const std::string& label, // create helper tmp view using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>; - tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count); + tmp_view_type tmp_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, ex, + "std_remove_if_tmp_view"), + keep_count); using tmp_readwrite_iterator_type = decltype(begin(tmp_view)); // in stage 1, *move* all elements to keep from original range to tmp diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index 428dc0d744a40f97f73ccd120120bb6795f57643..e6caa07288051a436e6c12ed0ced8c5fbf8edaf6 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include 
<std_algorithms/Kokkos_Distance.hpp> -#include <std_algorithms/Kokkos_Swap.hpp> #include <string> namespace Kokkos { @@ -31,7 +30,7 @@ namespace Impl { template <class InputIterator> struct StdReverseFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed<index_type>::value, + static_assert(std::is_signed_v<index_type>, "Kokkos: StdReverseFunctor requires signed index type"); InputIterator m_first; @@ -39,7 +38,7 @@ struct StdReverseFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]); + ::Kokkos::kokkos_swap(m_first[i], m_last[-i - 1]); } KOKKOS_FUNCTION diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp index dd20d90e399536fff3dfe939216c58e7aca0ed5d..7aa0e4fc44c8cb01f9bb1d77c6c8060eebf49ee0 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp @@ -30,7 +30,7 @@ namespace Impl { template <class InputIterator, class OutputIterator> struct StdReverseCopyFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed<index_type>::value, + static_assert(std::is_signed_v<index_type>, "Kokkos: StdReverseCopyFunctor requires signed index type"); InputIterator m_last; diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp index 50bc7c8d610aebd3980cceca6c1b73ee8f06112f..94147485071a684987a41472dde057fd0391fd8d 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp @@ -126,10 +126,11 @@ KOKKOS_FUNCTION IteratorType shift_left_team_impl( // execution space impl because for this team impl we are // 
within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first + n, last); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { first[i] = std::move(first[i + n]); } }); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp index cac20bfbba6af9d402c3f79c46c00c07e6ac7fb6..0414e6f1c25105c41ae60486c5099d74dd78195c 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp @@ -103,26 +103,6 @@ IteratorType shift_right_exespace_impl( return first + n; } -template <class Iterator> -struct StdShiftRightTeamSingleFunctor { - Iterator m_first; - Iterator m_last; - std::size_t m_shift; - - KOKKOS_FUNCTION - void operator()() const { - // the impl function calling this functor guarantees that - // - m_shift is non-negative - // - m_first, m_last identify a valid range with m_last > m_first - // - m_shift is less than m_last - m_first - // so I can safely use std::size_t here - } - - KOKKOS_FUNCTION - StdShiftRightTeamSingleFunctor(Iterator _first, Iterator _last, std::size_t n) - : m_first(std::move(_first)), m_last(std::move(_last)), m_shift(n) {} -}; - template <class TeamHandleType, class IteratorType> KOKKOS_FUNCTION IteratorType shift_right_team_impl( const TeamHandleType& teamHandle, IteratorType first, IteratorType last, @@ -145,10 +125,11 @@ KOKKOS_FUNCTION IteratorType shift_right_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type 
= typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first, last - n); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { last[-i - 1] = std::move(last[-n - i - 1]); } }); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp index 5bc77ed7ddcdbbe5ab6b8d260a489a0f569b1f42..930a14ac48c3b6b0b3bd3569e8345bf881b02311 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include <std_algorithms/Kokkos_Distance.hpp> -#include <std_algorithms/Kokkos_Swap.hpp> #include <string> namespace Kokkos { @@ -36,7 +35,7 @@ struct StdSwapRangesFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]); + ::Kokkos::kokkos_swap(m_first1[i], m_first2[i]); } KOKKOS_FUNCTION diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp index 11afa8ed6e082c2d292eb68f36a280ccba7d5b33..2863582458577ad56b86c8e2a18a55ee0431a69a 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp @@ -105,7 +105,9 @@ IteratorType unique_exespace_impl(const std::string& label, // using the same algorithm used for unique_copy but we now move things using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View<value_type*, ExecutionSpace>; - tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore); + tmp_view_type 
tmp_view(Kokkos::view_alloc(ex, Kokkos::WithoutInitializing, + "std_unique_tmp_view"), + num_elements_to_explore); // scan extent is: num_elements_to_explore - 1 // for same reason as the one explained in unique_copy diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp index c7c293027862b27b1bc1d5d74d58c6753dc98da3..710d04805d8fea98544be6e00e1363f687ba885a 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp @@ -175,8 +175,9 @@ KOKKOS_FUNCTION OutputIterator unique_copy_team_impl( d_first + count); } -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) __builtin_unreachable(); #endif } diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt index 419f5ec1d132564169425f3f90abf195fd801ade..31247af159b9bfc772218f0486b899156c3e48b1 100644 --- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -1,12 +1,10 @@ - #Leave these here for now - I don't need transitive deps anyway -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) - +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) 
+kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) -SET(ALGORITHM UnitTestMain.cpp) +set(ALGORITHM UnitTestMain.cpp) foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) string(TOUPPER ${Tag} DEVICE) @@ -23,20 +21,11 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # Generate a .cpp file for each one that runs it on the current backend (Tag), # and add this .cpp file to the sources for UnitTest_RandomAndSort. set(ALGO_SORT_SOURCES) - foreach(SOURCE_Input - TestSort - TestSortCustomComp - TestBinSortA - TestBinSortB - TestNestedSort - ) + foreach(SOURCE_Input TestSort TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB TestNestedSort) set(file ${dir}/${SOURCE_Input}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. - file(WRITE ${dir}/dummy.cpp - "#include <Test${Tag}_Category.hpp>\n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${Tag}_Category.hpp>\n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_SORT_SOURCES ${file}) endforeach() @@ -46,323 +35,276 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # ------------------------------------------ # do as above set(ALGO_RANDOM_SOURCES) - foreach(SOURCE_Input - TestRandom - ) + foreach(SOURCE_Input TestRandom) set(file ${dir}/${SOURCE_Input}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include <Test${Tag}_Category.hpp>\n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${Tag}_Category.hpp>\n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() + endif() +endforeach() - # ------------------------------------------ - # std set A - # ------------------------------------------ - set(STDALGO_SOURCES_A) - foreach(Name - StdReducers - 
StdAlgorithmsConstraints - RandomAccessIterator - ) - list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) - endforeach() - - # ------------------------------------------ - # std set B - # ------------------------------------------ - set(STDALGO_SOURCES_B) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsMinMaxElementOps - ) - list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std set A +# ------------------------------------------ +set(STDALGO_SOURCES_A) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator) + list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set C - # ------------------------------------------ - set(STDALGO_SOURCES_C) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsLexicographicalCompare - StdAlgorithmsForEach - StdAlgorithmsFind - StdAlgorithmsFindFirstOf - StdAlgorithmsFindEnd - StdAlgorithmsCount - StdAlgorithmsEqual - StdAlgorithmsAllAnyNoneOf - StdAlgorithmsAdjacentFind - StdAlgorithmsSearch - StdAlgorithmsSearch_n - StdAlgorithmsMismatch - StdAlgorithmsMoveBackward - ) - list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std set B +# ------------------------------------------ +set(STDALGO_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps) + list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set D - # ------------------------------------------ - set(STDALGO_SOURCES_D) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsModOps - StdAlgorithmsModSeqOps - StdAlgorithmsReplace - StdAlgorithmsReplaceIf - StdAlgorithmsReplaceCopy - StdAlgorithmsReplaceCopyIf - StdAlgorithmsCopyIf - StdAlgorithmsUnique - StdAlgorithmsUniqueCopy - StdAlgorithmsRemove - StdAlgorithmsRemoveIf - StdAlgorithmsRemoveCopy - StdAlgorithmsRemoveCopyIf - StdAlgorithmsRotate - 
StdAlgorithmsRotateCopy - StdAlgorithmsReverse - StdAlgorithmsShiftLeft - StdAlgorithmsShiftRight - ) - list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std set C +# ------------------------------------------ +set(STDALGO_SOURCES_C) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsLexicographicalCompare + StdAlgorithmsForEach + StdAlgorithmsFind + StdAlgorithmsFindFirstOf + StdAlgorithmsFindEnd + StdAlgorithmsCount + StdAlgorithmsEqual + StdAlgorithmsAllAnyNoneOf + StdAlgorithmsAdjacentFind + StdAlgorithmsSearch + StdAlgorithmsSearch_n + StdAlgorithmsMismatch + StdAlgorithmsMoveBackward +) + list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set E - # ------------------------------------------ - set(STDALGO_SOURCES_E) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsIsSorted - StdAlgorithmsIsSortedUntil - StdAlgorithmsPartitioningOps - StdAlgorithmsPartitionCopy - StdAlgorithmsNumerics - StdAlgorithmsAdjacentDifference - StdAlgorithmsExclusiveScan - StdAlgorithmsInclusiveScan - StdAlgorithmsTransformUnaryOp - StdAlgorithmsTransformExclusiveScan - StdAlgorithmsTransformInclusiveScan - ) - list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std set D +# ------------------------------------------ +set(STDALGO_SOURCES_D) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsModOps + StdAlgorithmsModSeqOps + StdAlgorithmsReplace + StdAlgorithmsReplaceIf + StdAlgorithmsReplaceCopy + StdAlgorithmsReplaceCopyIf + StdAlgorithmsCopyIf + StdAlgorithmsUnique + StdAlgorithmsUniqueCopy + StdAlgorithmsRemove + StdAlgorithmsRemoveIf + StdAlgorithmsRemoveCopy + StdAlgorithmsRemoveCopyIf + StdAlgorithmsRotate + StdAlgorithmsRotateCopy + StdAlgorithmsReverse + StdAlgorithmsShiftLeft + StdAlgorithmsShiftRight +) + list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) +endforeach() - # 
------------------------------------------ - # std team Q - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_Q) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamInclusiveScan - StdAlgorithmsTeamTransformInclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std set E +# ------------------------------------------ +set(STDALGO_SOURCES_E) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsIsSorted + StdAlgorithmsIsSortedUntil + StdAlgorithmsPartitioningOps + StdAlgorithmsPartitionCopy + StdAlgorithmsNumerics + StdAlgorithmsAdjacentDifference + StdAlgorithmsExclusiveScan + StdAlgorithmsInclusiveScan + StdAlgorithmsTransformUnaryOp + StdAlgorithmsTransformExclusiveScan + StdAlgorithmsTransformInclusiveScan +) + list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team P - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_P) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamExclusiveScan - StdAlgorithmsTeamTransformExclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team Q +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_Q) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan) + list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team M - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_M) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamTransformUnaryOp - StdAlgorithmsTeamTransformBinaryOp - StdAlgorithmsTeamGenerate - StdAlgorithmsTeamGenerate_n - StdAlgorithmsTeamSwapRanges - ) - list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team P +# 
------------------------------------------ +set(STDALGO_TEAM_SOURCES_P) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan) + list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team L - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_L) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamIsSorted - StdAlgorithmsTeamIsSortedUntil - StdAlgorithmsTeamIsPartitioned - StdAlgorithmsTeamPartitionCopy - StdAlgorithmsTeamPartitionPoint - ) - list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team M +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_M) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp StdAlgorithmsTeamTransformBinaryOp + StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges +) + list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team I - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_I) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamUnique - StdAlgorithmsTeamAdjacentDifference - StdAlgorithmsTeamReduce - StdAlgorithmsTeamTransformReduce - ) - list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team L +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_L) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil + StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint +) + list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team H - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_H) - foreach(Name - StdAlgorithmsCommon - 
StdAlgorithmsTeamCopy - StdAlgorithmsTeamCopy_n - StdAlgorithmsTeamCopyBackward - StdAlgorithmsTeamCopyIf - StdAlgorithmsTeamUniqueCopy - StdAlgorithmsTeamRemove - StdAlgorithmsTeamRemoveIf - StdAlgorithmsTeamRemoveCopy - StdAlgorithmsTeamRemoveCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team I +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_I) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce + StdAlgorithmsTeamTransformReduce +) + list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team G - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_G) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMove - StdAlgorithmsTeamMoveBackward - StdAlgorithmsTeamShiftLeft - StdAlgorithmsTeamShiftRight - ) - list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team H +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_H) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamCopy + StdAlgorithmsTeamCopy_n + StdAlgorithmsTeamCopyBackward + StdAlgorithmsTeamCopyIf + StdAlgorithmsTeamUniqueCopy + StdAlgorithmsTeamRemove + StdAlgorithmsTeamRemoveIf + StdAlgorithmsTeamRemoveCopy + StdAlgorithmsTeamRemoveCopyIf +) + list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team F - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_F) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamReverse - StdAlgorithmsTeamReverseCopy - StdAlgorithmsTeamRotate - StdAlgorithmsTeamRotateCopy - ) - list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team G +# 
------------------------------------------ +set(STDALGO_TEAM_SOURCES_G) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft + StdAlgorithmsTeamShiftRight +) + list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team E - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_E) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFill - StdAlgorithmsTeamFill_n - StdAlgorithmsTeamReplace - StdAlgorithmsTeamReplaceIf - StdAlgorithmsTeamReplaceCopy - StdAlgorithmsTeamReplaceCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team F +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_F) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate + StdAlgorithmsTeamRotateCopy +) + list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team D - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_D) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMinElement - StdAlgorithmsTeamMaxElement - StdAlgorithmsTeamMinMaxElement - ) - list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team E +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_E) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFill + StdAlgorithmsTeamFill_n + StdAlgorithmsTeamReplace + StdAlgorithmsTeamReplaceIf + StdAlgorithmsTeamReplaceCopy + StdAlgorithmsTeamReplaceCopyIf +) + list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team C - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_C) - foreach(Name - StdAlgorithmsCommon - 
StdAlgorithmsTeamFind - StdAlgorithmsTeamFindIf - StdAlgorithmsTeamFindIfNot - StdAlgorithmsTeamAllOf - StdAlgorithmsTeamAnyOf - StdAlgorithmsTeamNoneOf - StdAlgorithmsTeamSearchN - ) - list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team D +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement) + list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team B - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_B) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamEqual - StdAlgorithmsTeamSearch - StdAlgorithmsTeamFindEnd - StdAlgorithmsTeamFindFirstOf - ) - list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team C +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_C) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFind + StdAlgorithmsTeamFindIf + StdAlgorithmsTeamFindIfNot + StdAlgorithmsTeamAllOf + StdAlgorithmsTeamAnyOf + StdAlgorithmsTeamNoneOf + StdAlgorithmsTeamSearchN +) + list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team A - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_A) - foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamAdjacentFind - StdAlgorithmsTeamCount - StdAlgorithmsTeamCountIf - StdAlgorithmsTeamForEach - StdAlgorithmsTeamForEachN - StdAlgorithmsTeamLexicographicalCompare - StdAlgorithmsTeamMismatch - ) - list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) - endforeach() +# ------------------------------------------ +# std team B +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_B) +foreach(Name StdAlgorithmsCommon 
StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd + StdAlgorithmsTeamFindFirstOf +) + list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) +endforeach() - endif() +# ------------------------------------------ +# std team A +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_A) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamAdjacentFind + StdAlgorithmsTeamCount + StdAlgorithmsTeamCountIf + StdAlgorithmsTeamForEach + StdAlgorithmsTeamForEachN + StdAlgorithmsTeamLexicographicalCompare + StdAlgorithmsTeamMismatch +) + list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. -if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - list(REMOVE_ITEM ALGO_SORT_SOURCES - TestSort.cpp - ) +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION + VERSION_GREATER_EQUAL 16.0.0 +) + list(REMOVE_ITEM ALGO_SORT_SOURCES TestSort.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases # the impl needs to use either Kokkos or tailored reducers # which results in runtime memory errors. if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L - TestStdAlgorithmsTeamIsPartitioned.cpp - TestStdAlgorithmsTeamPartitionPoint.cpp - TestStdAlgorithmsTeamPartitionCopy.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L TestStdAlgorithmsTeamIsPartitioned.cpp + TestStdAlgorithmsTeamPartitionPoint.cpp TestStdAlgorithmsTeamPartitionCopy.cpp ) endif() @@ -370,7 +312,9 @@ endif() # in these cases the impl needs to use either Kokkos or # tailored reducers which results in runtime memory errors. 
if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_C + list( + REMOVE_ITEM + STDALGO_TEAM_SOURCES_C TestStdAlgorithmsTeamFind.cpp TestStdAlgorithmsTeamFindIf.cpp TestStdAlgorithmsTeamFindIfNot.cpp @@ -386,35 +330,20 @@ endif() # FRIZZI: 04/26/2023: not sure if the compilation error is still applicable # but we conservatively leave this guard on if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Sort - SOURCES - UnitTestMain.cpp - TestStdAlgorithmsCommon.cpp - ${ALGO_SORT_SOURCES} + kokkos_add_executable_and_test( + UnitTest_Sort SOURCES UnitTestMain.cpp TestStdAlgorithmsCommon.cpp ${ALGO_SORT_SOURCES} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Random - SOURCES - UnitTestMain.cpp - ${ALGO_RANDOM_SOURCES} - ) + kokkos_add_executable_and_test(UnitTest_Random SOURCES UnitTestMain.cpp ${ALGO_RANDOM_SOURCES}) endif() # FIXME_OPENMPTARGET: These tests cause internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) - list(REMOVE_ITEM STDALGO_SOURCES_D - TestStdAlgorithmsCopyIf.cpp - TestStdAlgorithmsRemoveCopy.cpp - TestStdAlgorithmsUnique.cpp - TestStdAlgorithmsUniqueCopy.cpp - ) - list(REMOVE_ITEM STDALGO_SOURCES_E - TestStdAlgorithmsExclusiveScan.cpp - TestStdAlgorithmsInclusiveScan.cpp + list(REMOVE_ITEM STDALGO_SOURCES_D TestStdAlgorithmsCopyIf.cpp TestStdAlgorithmsRemoveCopy.cpp + TestStdAlgorithmsUnique.cpp TestStdAlgorithmsUniqueCopy.cpp ) + list(REMOVE_ITEM STDALGO_SOURCES_E TestStdAlgorithmsExclusiveScan.cpp TestStdAlgorithmsInclusiveScan.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget @@ -422,48 +351,31 @@ endif() if(KOKKOS_ENABLE_OPENMPTARGET) # the following use either Kokkos or tailored reducers # which results in runtime memory errors. 
- list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B - TestStdAlgorithmsTeamFindEnd.cpp - TestStdAlgorithmsTeamFindFirstOf.cpp - TestStdAlgorithmsTeamSearch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B TestStdAlgorithmsTeamFindEnd.cpp TestStdAlgorithmsTeamFindFirstOf.cpp + TestStdAlgorithmsTeamSearch.cpp ) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A - TestStdAlgorithmsTeamAdjacentFind.cpp - TestStdAlgorithmsTeamLexicographicalCompare.cpp - TestStdAlgorithmsTeamMismatch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A TestStdAlgorithmsTeamAdjacentFind.cpp + TestStdAlgorithmsTeamLexicographicalCompare.cpp TestStdAlgorithmsTeamMismatch.cpp ) # this causes an illegal memory access if team_members_have_matching_result # is called - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M - TestStdAlgorithmsTeamTransformBinaryOp.cpp - ) + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M TestStdAlgorithmsTeamTransformBinaryOp.cpp) endif() foreach(ID A;B;C;D;E) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_SOURCES_${ID}} - ) + kokkos_add_executable_and_test(AlgorithmsUnitTest_StdSet_${ID} SOURCES UnitTestMain.cpp ${STDALGO_SOURCES_${ID}}) endforeach() foreach(ID A;B;C;D;E;F;G;H;I;L;M;P;Q) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_Team_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_TEAM_SOURCES_${ID}} - ) + kokkos_add_executable_and_test( + AlgorithmsUnitTest_StdSet_Team_${ID} SOURCES UnitTestMain.cpp ${STDALGO_TEAM_SOURCES_${ID}} + ) endforeach() # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. 
if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE( - AlgorithmsUnitTest_StdAlgoCompileOnly - SOURCES TestStdAlgorithmsCompileOnly.cpp - ) + kokkos_add_executable(AlgorithmsUnitTest_StdAlgoCompileOnly SOURCES TestStdAlgorithmsCompileOnly.cpp) endif() diff --git a/packages/kokkos/algorithms/unit_tests/Makefile b/packages/kokkos/algorithms/unit_tests/Makefile index 601217799a88bff73f467d2311bdba313ef32bb4..d3946c149baf87dbc16c317f948e9634a5c84792 100644 --- a/packages/kokkos/algorithms/unit_tests/Makefile +++ b/packages/kokkos/algorithms/unit_tests/Makefile @@ -27,13 +27,13 @@ TARGETS = tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ - $(shell echo "\#include <Test"${device}"_Category.hpp>" > Test$(device).cpp); \ - $(shell echo "\#include <TestRandom.hpp>" >> Test$(device).cpp); \ - $(shell echo "\#include <TestSort.hpp>" >> Test$(device).cpp); \ - $(shell echo "\#include <TestBinSortA.hpp>" >> Test$(device).cpp); \ - $(shell echo "\#include <TestBinSortB.hpp>" >> Test$(device).cpp); \ - $(shell echo "\#include <TestNestedSort.hpp>" >> Test$(device).cpp); \ - $(shell echo "\#include <TestSortCustomComp.hpp>" >> Test$(device).cpp); \ + $(shell echo "$(H)include <Test"${device}"_Category.hpp>" > Test$(device).cpp); \ + $(shell echo "$(H)include <TestRandom.hpp>" >> Test$(device).cpp); \ + $(shell echo "$(H)include <TestSort.hpp>" >> Test$(device).cpp); \ + $(shell echo "$(H)include <TestBinSortA.hpp>" >> Test$(device).cpp); \ + $(shell echo "$(H)include <TestBinSortB.hpp>" >> Test$(device).cpp); \ + $(shell echo "$(H)include <TestNestedSort.hpp>" >> Test$(device).cpp); \ + $(shell echo "$(H)include <TestSortCustomComp.hpp>" >> Test$(device).cpp); \ ) \ ) diff --git a/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp b/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp index 
dd3569e6715a8ef5e8c99b4bb0ba28702b6fe13d..bb074f248034a1d3f64632fb043bc21636dfed9b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestBinSortA.hpp @@ -31,13 +31,13 @@ struct bin3d_is_sorted_struct { using value_type = unsigned int; using execution_space = ExecutionSpace; - Kokkos::View<Scalar * [3], ExecutionSpace> keys; + Kokkos::View<Scalar* [3], ExecutionSpace> keys; int max_bins; Scalar min; Scalar max; - bin3d_is_sorted_struct(Kokkos::View<Scalar * [3], ExecutionSpace> keys_, + bin3d_is_sorted_struct(Kokkos::View<Scalar* [3], ExecutionSpace> keys_, int max_bins_, Scalar min_, Scalar max_) : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {} KOKKOS_INLINE_FUNCTION @@ -65,9 +65,9 @@ struct sum3D { using value_type = double; using execution_space = ExecutionSpace; - Kokkos::View<Scalar * [3], ExecutionSpace> keys; + Kokkos::View<Scalar* [3], ExecutionSpace> keys; - sum3D(Kokkos::View<Scalar * [3], ExecutionSpace> keys_) : keys(keys_) {} + sum3D(Kokkos::View<Scalar* [3], ExecutionSpace> keys_) : keys(keys_) {} KOKKOS_INLINE_FUNCTION void operator()(int i, double& count) const { count += keys(i, 0); @@ -77,8 +77,8 @@ struct sum3D { }; template <class ExecutionSpace, typename KeyType> -void test_3D_sort_impl(unsigned int n) { - using KeyViewType = Kokkos::View<KeyType * [3], ExecutionSpace>; +void test_3D_sort_impl(size_t n) { + using KeyViewType = Kokkos::View<KeyType* [3], ExecutionSpace>; KeyViewType keys("Keys", n * n * n); @@ -207,7 +207,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max<T>::value, - Kokkos::Experimental::finite_min<T>::value}; + Kokkos::Experimental::finite_min<T>::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View<T[2], Kokkos::HostSpace>(a)); Kokkos::sort(vd); @@ -219,6 +219,10 @@ void 
test_sort_integer_overflow() { } // namespace BinSortSetA TEST(TEST_CATEGORY, BinSortGenericTests) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -246,11 +250,11 @@ TEST(TEST_CATEGORY, BinSortEmptyView) { // does not matter if we use int or something else Kokkos::View<int*, ExecutionSpace> v("v", 0); - // test all exposed public sort methods - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Sorter.sort(v)); + // test all exposed public sort methods are callable and do not throw + Sorter.sort(ExecutionSpace(), v, 0, 0); + Sorter.sort(v, 0, 0); + Sorter.sort(ExecutionSpace(), v); + Sorter.sort(v); } TEST(TEST_CATEGORY, BinSortEmptyKeysView) { @@ -263,7 +267,26 @@ TEST(TEST_CATEGORY, BinSortEmptyKeysView) { BinOp_t binOp(5, 0, 10); Kokkos::BinSort<KeyViewType, BinOp_t> Sorter(ExecutionSpace{}, kv, binOp); - ASSERT_NO_THROW(Sorter.create_permute_vector(ExecutionSpace{})); + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw +} + +// BinSort may delegate sorting within bins to std::sort when running on host +// and having a sufficiently large number of items within a single bin (10 by +// default). Test that this is done without undefined behavior when accessing +// the boundaries of the bin. Should be used in conjunction with a memory +// sanitizer or bounds check. 
+TEST(TEST_CATEGORY, BinSort_issue_7221) { + using ExecutionSpace = TEST_EXECSPACE; + + using KeyViewType = Kokkos::View<int*, ExecutionSpace>; + KeyViewType kv("kv", 11); + + using BinOp_t = Kokkos::BinOp1D<KeyViewType>; + BinOp_t binOp(1, -10, 10); + Kokkos::BinSort<KeyViewType, BinOp_t> Sorter(ExecutionSpace{}, kv, binOp, + /*sort_within_bins*/ true); + + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw } } // namespace Test diff --git a/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp b/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp index a90224bf31589433ba028e36385b74abdeffd8b5..d11b53a9a61fd2f70f0bbaa888d7951a4955bedb 100644 --- a/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestBinSortB.hpp @@ -185,6 +185,10 @@ void run_for_rank2() { } // namespace BinSortSetB TEST(TEST_CATEGORY, BinSortUnsignedKeyLayoutStrideValues) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExeSpace = TEST_EXECSPACE; using key_type = unsigned; BinSortSetB::run_for_rank1<ExeSpace, key_type, int>(); diff --git a/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp b/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp index 1b7a3f48fc521fa38f87f7aa077e06efd53e495b..cd57fd23ecfa6f063ec75f9eb370e544fc729aaf 100644 --- a/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestNestedSort.hpp @@ -386,6 +386,11 @@ void test_nested_sort_by_key(unsigned int N, KeyType minKey, KeyType maxKey, } // namespace NestedSortImpl TEST(TEST_CATEGORY, NestedSort) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + 
using ExecutionSpace = TEST_EXECSPACE; NestedSortImpl::test_nested_sort<ExecutionSpace, unsigned>(171, 0U, UINT_MAX); NestedSortImpl::test_nested_sort<ExecutionSpace, float>(42, -1e6f, 1e6f); @@ -394,6 +399,11 @@ TEST(TEST_CATEGORY, NestedSort) { } TEST(TEST_CATEGORY, NestedSortByKey) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; // Second/third template arguments are key and value respectively. diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp index 472af1403b2de44a502500b2f75fe62bb14f5304..6960b912d0e3fc36e9e913312e3cbd8219012b81 100644 --- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -542,6 +542,11 @@ void test_duplicate_stream() { } // namespace AlgoRandomImpl TEST(TEST_CATEGORY, Random_XorShift64) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ @@ -562,6 +567,10 @@ TEST(TEST_CATEGORY, Random_XorShift64) { TEST(TEST_CATEGORY, Random_XorShift1024_0) { using ExecutionSpace = TEST_EXECSPACE; + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ defined(KOKKOS_ENABLE_HIP) @@ -589,7 +598,7 @@ TEST(TEST_CATEGORY, Multi_streams) { #endif #if defined(KOKKOS_ENABLE_SYCL) && 
defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if constexpr (std::is_same_v<ExecutionSpace, Kokkos::Experimental::SYCL>) { + if constexpr (std::is_same_v<ExecutionSpace, Kokkos::SYCL>) { GTEST_SKIP() << "Failing on NVIDIA GPUs"; // FIXME_SYCL } #endif diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp index 282d85548c55411e4e5c7bedd3a8c3f12948cd47..5ab348cb19333b56eed5ebf19fea00e7cbe6d47c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct random_access_iterator_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor<static_view_t>(m_static_view)); @@ -46,6 +46,44 @@ TEST_F(random_access_iterator_test, constructor) { EXPECT_TRUE(true); } +TEST_F(random_access_iterator_test, constructiblity) { + auto first_d = KE::begin(m_dynamic_view); + auto cfirst_d = KE::cbegin(m_dynamic_view); + + static_assert(std::is_constructible_v<decltype(cfirst_d), decltype(first_d)>); + static_assert( + !std::is_constructible_v<decltype(first_d), decltype(cfirst_d)>); + [[maybe_unused]] decltype(cfirst_d) tmp_cfirst_d(first_d); + + auto first_s = KE::begin(m_static_view); + auto cfirst_s = KE::cbegin(m_static_view); + + static_assert(std::is_constructible_v<decltype(cfirst_s), decltype(first_s)>); + static_assert( + !std::is_constructible_v<decltype(first_s), decltype(cfirst_s)>); + [[maybe_unused]] decltype(cfirst_s) tmp_cfirst_s(first_s); + + auto first_st = KE::begin(m_strided_view); + auto cfirst_st = KE::cbegin(m_strided_view); + + static_assert( + std::is_constructible_v<decltype(cfirst_st), decltype(first_st)>); + static_assert( + !std::is_constructible_v<decltype(first_st), decltype(cfirst_st)>); + [[maybe_unused]] decltype(cfirst_st) 
tmp_cfirst_st(first_st); + + // [FIXME] Better to have tests for the explicit specifier with an expression. + // As soon as View converting constructors are re-implemented with a + // conditional explicit, we may add those tests. + static_assert(std::is_constructible_v<decltype(first_s), decltype(first_d)>); + static_assert(std::is_constructible_v<decltype(first_st), decltype(first_d)>); + static_assert(std::is_constructible_v<decltype(first_d), decltype(first_s)>); + static_assert(std::is_constructible_v<decltype(first_st), decltype(first_s)>); + static_assert(std::is_constructible_v<decltype(first_d), decltype(first_st)>); + static_assert(std::is_constructible_v<decltype(first_s), decltype(first_st)>); + EXPECT_TRUE(true); +} + template <class IteratorType, class ValueType> void test_random_access_it_verify(IteratorType it, ValueType gold_value) { using view_t = Kokkos::View<typename IteratorType::value_type>; @@ -226,6 +264,37 @@ TEST_F(random_access_iterator_test, traits_helpers) { static_assert(KE::Impl::are_iterators_v<T1_t, T2_t, T3_t>); static_assert(KE::Impl::are_random_access_iterators_v<T1_t, T2_t, T3_t>); static_assert(!KE::Impl::are_iterators_v<int, T2_t, T3_t>); + + static_assert(std::is_same_v<decltype(KE::begin(m_static_view))::value_type, + value_type>); + static_assert(std::is_same_v<decltype(KE::begin(m_dynamic_view))::value_type, + value_type>); + static_assert(std::is_same_v<decltype(KE::begin(m_strided_view))::value_type, + value_type>); + + static_assert( + std::is_same_v<decltype(KE::end(m_static_view))::value_type, value_type>); + static_assert(std::is_same_v<decltype(KE::end(m_dynamic_view))::value_type, + value_type>); + static_assert(std::is_same_v<decltype(KE::end(m_strided_view))::value_type, + value_type>); + + static_assert( + std::is_same_v<decltype(KE::begin(m_static_view))::value_type, + decltype(KE::cbegin(m_static_view))::value_type>); + static_assert( + std::is_same_v<decltype(KE::begin(m_dynamic_view))::value_type, + 
decltype(KE::cbegin(m_dynamic_view))::value_type>); + static_assert( + std::is_same_v<decltype(KE::begin(m_strided_view))::value_type, + decltype(KE::cbegin(m_strided_view))::value_type>); + + static_assert(std::is_same_v<decltype(KE::end(m_static_view))::value_type, + decltype(KE::cend(m_static_view))::value_type>); + static_assert(std::is_same_v<decltype(KE::end(m_dynamic_view))::value_type, + decltype(KE::cend(m_dynamic_view))::value_type>); + static_assert(std::is_same_v<decltype(KE::end(m_strided_view))::value_type, + decltype(KE::cend(m_strided_view))::value_type>); } } // namespace stdalgos diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp index 968fb8950b74892fff7928d00f3a7a0af380732b..5ea88ae5d628c131dd7adfa446fa2b6a47c4589d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp @@ -197,7 +197,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max<T>::value, - Kokkos::Experimental::finite_min<T>::value}; + Kokkos::Experimental::finite_min<T>::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View<T[2], Kokkos::HostSpace>(a)); Kokkos::sort(vd); @@ -209,6 +209,10 @@ void test_sort_integer_overflow() { } // namespace SortImpl TEST(TEST_CATEGORY, SortUnsignedValueType) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -224,14 +228,19 @@ TEST(TEST_CATEGORY, SortUnsignedValueType) { } TEST(TEST_CATEGORY, SortEmptyView) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if 
defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else Kokkos::View<int*, ExecutionSpace> v("v", 0); + // checking that it does not throw // TODO check the synchronous behavior of the calls below - ASSERT_NO_THROW(Kokkos::sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Kokkos::sort(v)); + Kokkos::sort(ExecutionSpace(), v); + Kokkos::sort(v); } } // namespace Test diff --git a/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp new file mode 100644 index 0000000000000000000000000000000000000000..44abe4e73a4b171e43f90f82f186b544f1cbe92d --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -0,0 +1,255 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> +#include <Kokkos_Random.hpp> +#include <Kokkos_Sort.hpp> + +#include <utility> // pair + +namespace Test { +namespace SortImpl { + +struct Less { + template <class ValueType> + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs < rhs; + } +}; + +struct Greater { + template <class ValueType> + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs > rhs; + } +}; + +template <class ExecutionSpace, class Keys, class Permute, + class Comparator = Less> +struct is_sorted_by_key_struct { + Keys keys; + Keys keys_orig; + Permute permute; + Comparator comparator; + + is_sorted_by_key_struct(Keys keys_, Keys keys_orig_, Permute permute_, + Comparator comparator_ = Comparator{}) + : keys(keys_), + keys_orig(keys_orig_), + permute(permute_), + comparator(comparator_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, unsigned int &count) const { + if (i < keys.extent_int(0) - 1 && comparator(keys(i + 1), keys(i))) ++count; + if (keys(i) != keys_orig(permute(i))) ++count; + } +}; + +template <typename ExecutionSpace, typename ViewType> +void iota(ExecutionSpace const &space, ViewType const &v, + typename ViewType::value_type value = 0) { + using ValueType = typename ViewType::value_type; + Kokkos::parallel_for( + "Kokkos::Algorithms::iota", + Kokkos::RangePolicy<ExecutionSpace>(space, 0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = value + static_cast<ValueType>(i); }); +} + +} // namespace SortImpl + +TEST(TEST_CATEGORY, SortByKeyEmptyView) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View<int *, ExecutionSpace> keys("keys", 0); + Kokkos::View<float *, ExecutionSpace> 
values("values", 0); + + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); +} + +// Test #7036 +TEST(TEST_CATEGORY, SortByKeyEmptyViewHost) { + using ExecutionSpace = Kokkos::DefaultHostExecutionSpace; + + // does not matter if we use int or something else + Kokkos::View<int *, ExecutionSpace> keys("keys", 0); + Kokkos::View<float *, ExecutionSpace> values("values", 0); + + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); +} + +TEST(TEST_CATEGORY, SortByKey) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + for (auto keys_vector : {std::vector<int>{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector<int>{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector<int>{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector<int>{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View<int *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View<int *, ExecutionSpace> permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecutionSpace>(space, 0, n), + SortImpl::is_sorted_by_key_struct<ExecutionSpace, decltype(keys), + decltype(permute)>(keys, keys_orig, + permute), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyWithComparator) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + SortImpl::Greater comparator; + + for (auto keys_vector : {std::vector<int>{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector<int>{36, 19, 25, 17, 
3, 9, 1, 2, 7}, + std::vector<int>{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector<int>{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View<int *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View<int *, ExecutionSpace> permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute, comparator); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecutionSpace>(space, 0, n), + SortImpl::is_sorted_by_key_struct<ExecutionSpace, decltype(keys), + decltype(permute), SortImpl::Greater>( + keys, keys_orig, permute, comparator), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyStaticExtents) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + Kokkos::View<int[10], ExecutionSpace> keys("keys"); + + Kokkos::View<int[10], ExecutionSpace> values_static("values_static"); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_static); + + Kokkos::View<int *, ExecutionSpace> values_dynamic("values_dynamic", 10); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic); +} + +template <typename ExecutionSpace, typename Keys, typename Values> +void buildViewsForStrided(ExecutionSpace const &space, int n, Keys &keys, + Values &values) { + Kokkos::parallel_for( + "create_data", + Kokkos::MDRangePolicy<Kokkos::Rank<3>, ExecutionSpace>(space, {0, 0, 0}, + {n, n, n}), + KOKKOS_LAMBDA(int i, int j, int k) { + keys(i, j, k) = n - i; + values(i, j, k) = j; + }); +} + +TEST(TEST_CATEGORY, SortByKeyWithStrides) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + auto const n = 10; + + Kokkos::View<int ***, 
ExecutionSpace> keys("keys", n, n, n); + Kokkos::View<int ***, ExecutionSpace> values("values", n, n, n); + buildViewsForStrided(space, n, keys, values); + + auto keys_sub = Kokkos::subview(keys, Kokkos::ALL(), 1, 2); + auto values_sub = Kokkos::subview(values, 4, Kokkos::ALL(), 6); + + auto keys_orig = Kokkos::create_mirror(space, keys_sub); + Kokkos::deep_copy(space, keys_orig, keys_sub); + + Kokkos::Experimental::sort_by_key(space, keys_sub, values_sub); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecutionSpace>(space, 0, n), + SortImpl::is_sorted_by_key_struct<ExecutionSpace, decltype(keys_sub), + decltype(values_sub)>( + keys_sub, keys_orig, values_sub), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); +} + +TEST(TEST_CATEGORY_DEATH, SortByKeyKeysLargerThanValues) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View<int *, ExecutionSpace> keys("keys", 3); + Kokkos::View<float *, ExecutionSpace> values("values", 1); + + ASSERT_DEATH( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values), + "values and keys extents must be the same"); + ASSERT_DEATH(Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values, + SortImpl::Greater{}), + "values and keys extents must be the same"); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 75ad533f6ee4f2b129801066653739d7352870d7..208b46b15f273569125f5d3e6a694aa53d0c43f5 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -96,7 +96,7 @@ void fill_view(DestViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << 
"invalid choice"; } Kokkos::deep_copy(aux_view, aux_v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp index fa4ff48dbef81c6927bfe4760c9dcd995346f6d3..d8b80675c9d8b725e29184f960b3d73661eee0e8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp @@ -173,7 +173,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -243,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) { { auto res_it = KE::adjacent_find(exespace(), KE::cbegin(view), - KE::cend(view), args...); + KE::cend(view), args...); const auto my_diff = res_it - KE::cbegin(view); verify(my_diff, view, args...); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index b962218b5f0ca0a832dda9c8c91828ecfd948c11..dadce2d4748ace932c59d0805ff336817f99ca72 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -198,8 +198,9 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) { // this is needed for intel to avoid // error #1011: missing return statement at end of non-void function -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) __builtin_unreachable(); #endif } @@ -238,16 +239,8 @@ KOKKOS_FUNCTION bool team_members_have_matching_result( // set accum to 1 if a mismach is found const bool mismatch = memberValue != 
target; int accum = static_cast<int>(mismatch); - // FIXME_OPENMPTARGET: team API does not meet the TeamHandle concept and - // ignores the reducer passed -#if defined KOKKOS_ENABLE_OPENMPTARGET - Kokkos::Sum<int> dummyReducer(accum); - const auto result = teamHandle.team_reduce(accum, dummyReducer); - return (result == 0); -#else teamHandle.team_reduce(Kokkos::Sum<int>(accum)); return (accum == 0); -#endif } template <class ValueType1, class ValueType2> @@ -541,10 +534,10 @@ void fill_views_inc(ViewType view, ViewHostType host_view) { } template <class ValueType, class ViewType> -std::enable_if_t<!std::is_same<typename ViewType::traits::array_layout, - Kokkos::LayoutStride>::value> +std::enable_if_t<!std::is_same_v<typename ViewType::traits::array_layout, + Kokkos::LayoutStride>> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same<ValueType, typename ViewType::value_type>::value, + static_assert(std::is_same_v<ValueType, typename ViewType::value_type>, "Non-matching value types of view and reference value"); auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); for (std::size_t i = 0; i < view_h.extent(0); i++) { @@ -553,10 +546,10 @@ verify_values(ValueType expected, const ViewType view) { } template <class ValueType, class ViewType> -std::enable_if_t<std::is_same<typename ViewType::traits::array_layout, - Kokkos::LayoutStride>::value> +std::enable_if_t<std::is_same_v<typename ViewType::traits::array_layout, + Kokkos::LayoutStride>> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same<ValueType, typename ViewType::value_type>::value, + static_assert(std::is_same_v<ValueType, typename ViewType::value_type>, "Non-matching value types of view and reference value"); using non_strided_view_t = Kokkos::View<typename ViewType::value_type*>; @@ -573,11 +566,11 @@ verify_values(ValueType expected, const ViewType view) { } template <class ViewType1, class ViewType2> 
-std::enable_if_t<!std::is_same<typename ViewType2::traits::array_layout, - Kokkos::LayoutStride>::value> +std::enable_if_t<!std::is_same_v<typename ViewType2::traits::array_layout, + Kokkos::LayoutStride>> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same<typename ViewType1::value_type, - typename ViewType2::value_type>::value, + static_assert(std::is_same_v<typename ViewType1::value_type, + typename ViewType2::value_type>, "Non-matching value types of expected and actual view"); auto expected_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), expected); @@ -590,11 +583,11 @@ compare_views(ViewType1 expected, const ViewType2 actual) { } template <class ViewType1, class ViewType2> -std::enable_if_t<std::is_same<typename ViewType2::traits::array_layout, - Kokkos::LayoutStride>::value> +std::enable_if_t<std::is_same_v<typename ViewType2::traits::array_layout, + Kokkos::LayoutStride>> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same<typename ViewType1::value_type, - typename ViewType2::value_type>::value, + static_assert(std::is_same_v<typename ViewType1::value_type, + typename ViewType2::value_type>, "Non-matching value types of expected and actual view"); using non_strided_view_t = Kokkos::View<typename ViewType2::value_type*>; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 386d533f7a8308244193bf66ce1b93edaaf8ca69..923ea970f91d76dc1a56c53a4c8681dd6e0e677b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,5 +81,109 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } +TEST(std_algorithms_DeathTest, expect_no_overlap) { + namespace KE = Kokkos::Experimental; + using value_type = double; + + static constexpr size_t 
extent0 = 13; + + //------------- + // 1d views + //------------- + using static_view_1d_t = Kokkos::View<value_type[extent0]>; + [[maybe_unused]] static_view_1d_t static_view_1d{ + "std-algo-test-1d-contiguous-view-static"}; + + using dyn_view_1d_t = Kokkos::View<value_type*>; + [[maybe_unused]] dyn_view_1d_t dynamic_view_1d{ + "std-algo-test-1d-contiguous-view-dynamic", extent0}; + + using strided_view_1d_t = Kokkos::View<value_type*, Kokkos::LayoutStride>; + Kokkos::LayoutStride layout1d{extent0, 2}; + strided_view_1d_t strided_view_1d{"std-algo-test-1d-strided-view", layout1d}; + +// Overlapping because iterators are identical +#if defined(KOKKOS_ENABLE_DEBUG) + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + auto first_s = KE::begin(static_view_1d); + auto last_s = first_s + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, + "Kokkos contract violation:.*"); + + auto first_d = KE::begin(dynamic_view_1d); + auto last_d = first_d + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d, last_d, first_d); }, + "Kokkos contract violation:.*"); + + auto first_st = KE::begin(strided_view_1d); + auto last_st = first_st + extent0; + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_st, last_st, first_st); }, + "Kokkos contract violation:.*"); +#endif + + // Ranges are overlapped + static constexpr size_t sub_extent0 = 6, offset0 = 3; + std::pair<size_t, size_t> range0(0, sub_extent0), + range1(offset0, offset0 + sub_extent0); +#if defined(KOKKOS_ENABLE_DEBUG) + auto static_view_1d_0 = Kokkos::subview(static_view_1d, range0); + auto static_view_1d_1 = Kokkos::subview(static_view_1d, range1); + auto first_s0 = KE::begin(static_view_1d_0); // [0, 6) + auto last_s0 = first_s0 + static_view_1d_0.extent(0); + auto first_s1 = KE::begin(static_view_1d_1); // [3, 9) + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s0, last_s0, first_s1); }, + "Kokkos contract violation:.*"); + + auto dynamic_view_1d_0 = 
Kokkos::subview(dynamic_view_1d, range0); + auto dynamic_view_1d_1 = Kokkos::subview(dynamic_view_1d, range1); + auto first_d0 = KE::begin(dynamic_view_1d_0); // [0, 6) + auto last_d0 = first_d0 + dynamic_view_1d_0.extent(0); + auto first_d1 = KE::begin(dynamic_view_1d_1); // [3, 9) + EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_d0, last_d0, first_d1); }, + "Kokkos contract violation:.*"); +#endif + + auto strided_view_1d_0 = Kokkos::subview(strided_view_1d, range0); + auto strided_view_1d_1 = Kokkos::subview(strided_view_1d, range1); + auto first_st0 = KE::begin(strided_view_1d_0); // [0, 12) + auto last_st0 = first_st0 + strided_view_1d_0.extent(0); + auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) + // Does not overlap since offset (=3) is not divisible by stride (=2) + KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); + + // Iterating over the same range without overlapping + Kokkos::View<value_type[2][extent0], Kokkos::LayoutLeft> static_view_2d{ + "std-algo-test-2d-contiguous-view-static"}; + auto sub_static_view_1d_0 = Kokkos::subview(static_view_2d, 0, Kokkos::ALL); + auto sub_static_view_1d_1 = Kokkos::subview(static_view_2d, 1, Kokkos::ALL); + auto sub_first_s0 = KE::begin(sub_static_view_1d_0); // 0, 2, 4, ... + auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); + auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... + + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); + + Kokkos::View<value_type**, Kokkos::LayoutLeft> dynamic_view_2d{ + "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; + auto sub_dynamic_view_1d_0 = Kokkos::subview(dynamic_view_2d, 0, Kokkos::ALL); + auto sub_dynamic_view_1d_1 = Kokkos::subview(dynamic_view_2d, 1, Kokkos::ALL); + auto sub_first_d0 = KE::begin(sub_dynamic_view_1d_0); // 0, 2, 4, ... + auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); + auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... 
+ + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); + + Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; + Kokkos::View<value_type**, Kokkos::LayoutStride> strided_view_2d{ + "std-algo-test-2d-contiguous-view-strided", layout2d}; + auto sub_strided_view_1d_0 = Kokkos::subview(strided_view_2d, 0, Kokkos::ALL); + auto sub_strided_view_1d_1 = Kokkos::subview(strided_view_2d, 1, Kokkos::ALL); + auto sub_first_st0 = KE::begin(sub_strided_view_1d_0); // 0, 6, 12, ... + auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); + auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... + + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); +} + } // namespace stdalgos } // namespace Test diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index 5778e37be04d160a8947f1669e52639484e0a21f..7c9e8f84bfa466772bcd4d8042703fbfd7803a6d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -107,7 +107,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name, } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -202,7 +202,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } @@ -224,7 +224,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view<ValueType>(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } @@ -233,7 
+233,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view<ValueType>(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 6ab68a1987df17a837c9895a1a53d71075bbcb5a..a85e63fe3454c0787409b14607fbaff5370bf8fd 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -16,6 +16,7 @@ #include <TestStdAlgorithmsCommon.hpp> #include <utility> +#include <iomanip> namespace Test { namespace stdalgos { @@ -109,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -132,47 +133,6 @@ void my_host_exclusive_scan(it1 first, it1 last, it2 dest, ValType init, } } -template <class ViewType1, class ViewType2, class ValueType, class BinaryOp> -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - ValueType init_value, BinaryOp bop) { - //! 
always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View<gold_view_value_type*, Kokkos::HostSpace> gold_h( - "goldh", data_view.extent(0)); - my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), init_value, bop); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - if (test_view_h.extent(0) > 0) { - for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same<gold_view_value_type, int>::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast<double>(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast<double>(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - } -} - template <class ValueType> struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -189,107 +149,153 @@ struct SumFunctor { } }; +struct VerifyData { + template <class ViewType1, class ViewType2, class ValueType, class BinaryOp> + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value, BinaryOp bop) { + //! 
always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View<gold_view_value_type*, Kokkos::HostSpace> gold_h( + "goldh", data_view.extent(0)); + my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), init_value, bop); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + if (test_view_h.extent(0) > 0) { + for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { + if (std::is_same<gold_view_value_type, int>::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast<double>(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast<double>(test_view_h(i)) << " " + << static_cast<double>(gold_h(i)); + } + } + } + } + + template <class ViewType1, class ViewType2, class ValueType> + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value) { + (*this)(data_view, test_view, init_value, SumFunctor<ValueType>()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template <class Tag, class ValueType, class InfoType> -void run_single_scenario_default_op(const InfoType& scenario_info, - ValueType init_value) { - using default_op = SumFunctor<ValueType>; +template <class Tag, class ValueType, class InfoType, class... OpOrEmpty> +void run_single_scenario(const InfoType& scenario_info, ValueType init_value, + OpOrEmpty... 
empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; auto view_dest = create_view<ValueType>(Tag{}, view_ext, "exclusive_scan"); auto view_from = create_view<ValueType>(Tag{}, view_ext, "exclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm every time to + // ensure the algorithm does something meaningful { fill_zero(view_dest); auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value); + auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value, + empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } 
Kokkos::fence(); } -template <class Tag, class ValueType, class InfoType, class BinaryOp> -void run_single_scenario_custom_op(const InfoType& scenario_info, - ValueType init_value, BinaryOp bop) { +template <class Tag, class ValueType, class InfoType, class... OpOrEmpty> +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, + OpOrEmpty... empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - auto view_dest = create_view<ValueType>(Tag{}, view_ext, "exclusive_scan"); - auto view_from = create_view<ValueType>(Tag{}, view_ext, "exclusive_scan"); - fill_view(view_from, name); + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op. Therefore, after the op is done, view2 should contain the + // result of doing exclusive scan. NOTE: view2 is filled below every time + // because the algorithm acts in place. + + auto view1 = + create_view<ValueType>(Tag{}, view_ext, "exclusive_scan_inplace_view1"); + fill_view(view1, name); + auto view2 = + create_view<ValueType>(Tag{}, view_ext, "exclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } 
{ - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = - KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } Kokkos::fence(); @@ -303,34 +309,39 @@ void run_exclusive_scan_all_scenarios() { {"medium", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op<Tag, ValueType>(it, ValueType{0}); - run_single_scenario_default_op<Tag, ValueType>(it, ValueType{1}); - run_single_scenario_default_op<Tag, ValueType>(it, ValueType{-2}); - run_single_scenario_default_op<Tag, ValueType>(it, ValueType{3}); + run_single_scenario<Tag, ValueType>(it, ValueType{0}); + run_single_scenario<Tag, ValueType>(it, ValueType{1}); + run_single_scenario<Tag, ValueType>(it, ValueType{-2}); + run_single_scenario<Tag, ValueType>(it, ValueType{3}); + + 
run_single_scenario_inplace<Tag, ValueType>(it, ValueType{0}); + run_single_scenario_inplace<Tag, ValueType>(it, ValueType{-2}); #if !defined KOKKOS_ENABLE_OPENMPTARGET // custom multiply op is only run for small views otherwise it overflows if (it.first == "small-a" || it.first == "small-b") { using custom_bop_t = MultiplyFunctor<ValueType>; - run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{1}, - custom_bop_t()); - run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{3}, - custom_bop_t()); - } + run_single_scenario<Tag, ValueType>(it, ValueType{0}, custom_bop_t()); + run_single_scenario<Tag, ValueType>(it, ValueType{1}, custom_bop_t()); + run_single_scenario<Tag, ValueType>(it, ValueType{-2}, custom_bop_t()); + run_single_scenario<Tag, ValueType>(it, ValueType{3}, custom_bop_t()); - using custom_bop_t = SumFunctor<ValueType>; - run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{1}, + run_single_scenario_inplace<Tag, ValueType>(it, ValueType{0}, custom_bop_t()); - run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op<Tag, ValueType>(it, ValueType{3}, + run_single_scenario_inplace<Tag, ValueType>(it, ValueType{-2}, custom_bop_t()); + } + + using custom_bop_t = SumFunctor<ValueType>; + run_single_scenario<Tag, ValueType>(it, ValueType{0}, custom_bop_t()); + run_single_scenario<Tag, ValueType>(it, ValueType{1}, custom_bop_t()); + run_single_scenario<Tag, ValueType>(it, ValueType{-2}, custom_bop_t()); + run_single_scenario<Tag, ValueType>(it, ValueType{3}, custom_bop_t()); + + run_single_scenario_inplace<Tag, ValueType>(it, ValueType{0}, + custom_bop_t()); + run_single_scenario_inplace<Tag, ValueType>(it, ValueType{-2}, + 
custom_bop_t()); #endif } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp index 793b98a67f16a81cc264514e8bfec41e197395cf..b24730ff00944540946cf8553368d3521184f780 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp @@ -55,7 +55,6 @@ void test_for_each(const ViewType view) { std::for_each(KE::begin(expected), KE::end(expected), non_mod_functor); compare_views(expected, view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) const auto mod_lambda = KOKKOS_LAMBDA(value_t & i) { ++i; }; // pass view, lambda takes non-const ref @@ -79,7 +78,6 @@ void test_for_each(const ViewType view) { KE::for_each(exespace(), KE::cbegin(view), KE::cend(view), non_mod_lambda); std::for_each(KE::cbegin(expected), KE::cend(expected), non_mod_lambda); compare_views(expected, view); -#endif } // std::for_each_n is C++17, so we cannot compare results directly diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp index 8dbd6cd7e30bf9523ab4ee4902db51ff01fb8d69..2b3361743e4dfb23212731bbbe0ca849c61dde75 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp @@ -104,7 +104,7 @@ struct AssignIndexFunctor { template <class ValueType> struct IsEvenFunctor { - static_assert(std::is_integral<ValueType>::value, + static_assert(std::is_integral_v<ValueType>, "IsEvenFunctor uses operator%, so ValueType must be int"); KOKKOS_INLINE_FUNCTION diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index 8e60a43e5ffb67e594daadf2b1d7b1a7e2b0e4a9..b4f40b4651d615cd0adf1660792b782e5bb56baa 
100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -16,6 +16,7 @@ #include <TestStdAlgorithmsCommon.hpp> #include <utility> +#include <iomanip> namespace Test { namespace stdalgos { @@ -109,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -143,51 +144,6 @@ void my_host_inclusive_scan(it1 first, it1 last, it2 dest, BinOp bop, } } -template <class ViewType1, class ViewType2, class BinaryOp, class... Args> -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - BinaryOp bop, Args... args /* copy on purpose */) { - //! always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View<gold_view_value_type*, Kokkos::HostSpace> gold_h( - "goldh", data_view.extent(0)); - my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), bop, args...); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - - const auto ext = test_view_h.extent(0); - if (ext > 0) { - for (std::size_t i = 0; i < ext; ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - - if (std::is_same<gold_view_value_type, int>::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast<double>(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << 
i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast<double>(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - // std::cout << " last el: " << test_view_h(ext-1) << std::endl; - } -} - template <class ValueType> struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -204,107 +160,151 @@ struct SumFunctor { } }; +struct VerifyData { + template <class ViewType1, class ViewType2, class BinaryOp, class... Args> + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + BinaryOp bop, Args... args /* copy on purpose */) { + //! always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View<gold_view_value_type*, Kokkos::HostSpace> gold_h( + "goldh", data_view.extent(0)); + my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), bop, args...); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + + const auto ext = test_view_h.extent(0); + if (ext > 0) { + for (std::size_t i = 0; i < ext; ++i) { + if (std::is_same<gold_view_value_type, int>::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast<double>(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast<double>(test_view_h(i)) << " " + << static_cast<double>(gold_h(i)); + } + } + } + } + + template <class ViewType1, class ViewType2> + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view) // the view to test + { + using value_type = typename 
ViewType1::non_const_value_type; + (*this)(data_view, test_view, SumFunctor<value_type>()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template <class Tag, class ValueType, class InfoType> -void run_single_scenario_default_op(const InfoType& scenario_info) { - using default_op = SumFunctor<ValueType>; +template <class Tag, class ValueType, class InfoType, class... Args> +void run_single_scenario(const InfoType& scenario_info, + Args... args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "inclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << std::endl; auto view_dest = create_view<ValueType>(Tag{}, view_ext, "inclusive_scan"); auto view_from = create_view<ValueType>(Tag{}, view_ext, "inclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm every time to + // ensure the algorithm does something meaningful { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan(exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), 
view_from, view_dest); + auto r = KE::inclusive_scan(exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest); + auto r = + KE::inclusive_scan("label", exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } Kokkos::fence(); } -template <class Tag, class ValueType, class InfoType, class BinaryOp, - class... Args> -void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, - Args... args /* copy on purpose */) { +template <class Tag, class ValueType, class InfoType, class... Args> +void run_single_scenario_inplace(const InfoType& scenario_info, + Args... args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // if (1 == sizeof...(Args)) { - // std::cout << "inclusive_scan custom op and init value: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } else { - // std::cout << "inclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op. Therefore, after the op is done, view2 should contain the + // result of doing inclusive scan. NOTE: view2 is filled below every time + // because the algorithm acts in place. - auto view_dest = create_view<ValueType>(Tag{}, view_ext, "inclusive_scan"); - auto view_from = 
create_view<ValueType>(Tag{}, view_ext, "inclusive_scan"); - fill_view(view_from, name); + auto view1 = + create_view<ValueType>(Tag{}, view_ext, "inclusive_scan_inplace_view1"); + fill_view(view1, name); + + auto view2 = + create_view<ValueType>(Tag{}, view_ext, "inclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } Kokkos::fence(); @@ -318,27 +318,35 @@ void 
run_inclusive_scan_all_scenarios() { {"medium-a", 313}, {"medium-b", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op<Tag, ValueType>(it); + run_single_scenario<Tag, ValueType>(it); + run_single_scenario_inplace<Tag, ValueType>(it); #if !defined KOKKOS_ENABLE_OPENMPTARGET // the sum custom op is always run using sum_binary_op = SumFunctor<ValueType>; sum_binary_op sbop; - run_single_scenario_custom_op<Tag, ValueType>(it, sbop); - run_single_scenario_custom_op<Tag, ValueType>(it, sbop, ValueType{0}); - run_single_scenario_custom_op<Tag, ValueType>(it, sbop, ValueType{1}); - run_single_scenario_custom_op<Tag, ValueType>(it, sbop, ValueType{-2}); - run_single_scenario_custom_op<Tag, ValueType>(it, sbop, ValueType{3}); + run_single_scenario<Tag, ValueType>(it, sbop); + run_single_scenario<Tag, ValueType>(it, sbop, ValueType{0}); + run_single_scenario<Tag, ValueType>(it, sbop, ValueType{1}); + run_single_scenario<Tag, ValueType>(it, sbop, ValueType{-2}); + run_single_scenario<Tag, ValueType>(it, sbop, ValueType{3}); + + run_single_scenario_inplace<Tag, ValueType>(it, sbop, ValueType{0}); + run_single_scenario_inplace<Tag, ValueType>(it, sbop, ValueType{-2}); // custom multiply only for small views to avoid overflows if (it.first == "small-a" || it.first == "small-b") { using mult_binary_op = MultiplyFunctor<ValueType>; mult_binary_op mbop; - run_single_scenario_custom_op<Tag, ValueType>(it, mbop); - run_single_scenario_custom_op<Tag, ValueType>(it, mbop, ValueType{0}); - run_single_scenario_custom_op<Tag, ValueType>(it, mbop, ValueType{1}); - run_single_scenario_custom_op<Tag, ValueType>(it, mbop, ValueType{-2}); - run_single_scenario_custom_op<Tag, ValueType>(it, mbop, ValueType{3}); + run_single_scenario<Tag, ValueType>(it, mbop); + run_single_scenario<Tag, ValueType>(it, mbop, ValueType{0}); + run_single_scenario<Tag, ValueType>(it, mbop, ValueType{1}); + run_single_scenario<Tag, ValueType>(it, mbop, ValueType{-2}); + 
run_single_scenario<Tag, ValueType>(it, mbop, ValueType{3}); + + run_single_scenario_inplace<Tag, ValueType>(it, mbop); + run_single_scenario_inplace<Tag, ValueType>(it, mbop, ValueType{0}); + run_single_scenario_inplace<Tag, ValueType>(it, mbop, ValueType{-2}); } #endif } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index f31d49e06b4ad297d2de370b08bb6967a292d90d..18928a35266996205aa6261128f9014db29623c5 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -122,7 +122,8 @@ bool compute_gold(const std::string& name) { } else if (name == "large-b") { return false; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return false; // unreachable } } @@ -146,7 +147,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsA[3] = KE::is_sorted("label", exespace(), view); const auto allA = std::all_of(resultsA.cbegin(), resultsA.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allA); + EXPECT_TRUE(allA) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator<ValueType, ValueType> comp; @@ -154,12 +155,12 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[0] = KE::is_sorted(exespace(), KE::cbegin(view), KE::cend(view), comp); resultsB[1] = KE::is_sorted("label", exespace(), KE::cbegin(view), - KE::cend(view), comp); + KE::cend(view), comp); resultsB[2] = KE::is_sorted(exespace(), view, comp); resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), [=](bool v) { 
return v == gold; }); - EXPECT_TRUE(allB); + EXPECT_TRUE(allB) << name << ", " << view_tag_to_string(Tag{}); #endif Kokkos::fence(); @@ -173,9 +174,6 @@ void run_is_sorted_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario<Tag, ValueType>(it); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index dcfe8ad67e115a3d410b12ebf0f2ea10720d39ed..8327bfe13c0b083301f14eedc0ac3c0d621c2bf1 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -123,7 +123,8 @@ auto compute_gold(ViewType view, const std::string& name) { } else if (name == "large-b") { return KE::begin(view) + 156; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return KE::end(view); // unreachable } } @@ -145,10 +146,10 @@ void run_single_scenario(const InfoType& scenario_info) { KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view)); auto r3 = KE::is_sorted_until(exespace(), view); auto r4 = KE::is_sorted_until("label", exespace(), view); - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET 
CustomLessThanComparator<ValueType, ValueType> comp; @@ -160,10 +161,10 @@ void run_single_scenario(const InfoType& scenario_info) { auto r8 = KE::is_sorted_until("label", exespace(), view, comp); #endif - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); Kokkos::fence(); } @@ -176,9 +177,6 @@ void run_is_sorted_until_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted_until: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario<Tag, ValueType>(it); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp index f3b3e269c446658c75363ac141391e8ebb842e94..df5df756d2ae6fd7467215cb47ebde089e7035b1 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp @@ -86,7 +86,7 @@ void run_single_scenario(ViewType view1, ViewType view2, v2_h(ext2 / 2) = -5; } } else { - throw std::runtime_error("Kokkos: stdalgo: test: mismatch: Invalid string"); + FAIL() << "Kokkos: stdalgo: test: mismatch: Invalid string"; } Kokkos::deep_copy(aux_view1, v1_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 4604764097ebc1ff69f391a44364521acdefeaec..6918185bc083be489784356a619ef675f95349a0 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct 
MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference<move_t>::value, ""); + static_assert(std::is_rvalue_reference_v<move_t>); // move constr MyMovableType b(std::move(a)); @@ -70,7 +70,7 @@ struct StdAlgoModSeqOpsTestMove { void operator()(const int index) const { typename ViewType::value_type a{11}; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference<move_t>::value, ""); + static_assert(std::is_rvalue_reference<move_t>::value); m_view(index) = std::move(a); } @@ -89,50 +89,6 @@ TEST(std_algorithms_mod_ops_test, move_within_parfor) { } } -// ------------ -// swap -// ------------ -TEST(std_algorithms_mod_ops_test, swap) { - { - int a = 1; - int b = 2; - KE::swap(a, b); - ASSERT_EQ(a, 2); - ASSERT_EQ(b, 1); - } - - { - double a = 3.; - double b = 1.; - KE::swap(a, b); - EXPECT_DOUBLE_EQ(a, 1.); - EXPECT_DOUBLE_EQ(b, 3.); - } -} - -template <class ViewType> -struct StdAlgoModSeqOpsTestSwap { - ViewType m_view; - - KOKKOS_INLINE_FUNCTION - void operator()(const int index) const { - typename ViewType::value_type newval{11}; - KE::swap(m_view(index), newval); - } - - StdAlgoModSeqOpsTestSwap(ViewType aIn) : m_view(aIn) {} -}; - -TEST(std_algorithms_mod_ops_test, swap_within_parfor) { - auto a = create_view<double>(stdalgos::DynamicTag{}, 10, "a"); - StdAlgoModSeqOpsTestSwap<decltype(a)> fnc(a); - Kokkos::parallel_for(a.extent(0), fnc); - auto a_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a); - for (std::size_t i = 0; i < a.extent(0); ++i) { - EXPECT_DOUBLE_EQ(a_h(0), 11.); - } -} - // ------------ // iter_swap // ------------ diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp index f80f30797e43cb5a45cdc255b9ef676a293caf2d..42a17d7377962635db9c99fbb025a9e587cfd628 100644 --- 
a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct std_algorithms_mod_seq_ops_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor<static_view_t>(m_static_view)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp index b201ab95c1a6b967d1fe17731295c0a086aa5716..88e2a68ff17d1eaee222a7116a914e2ea6836eb8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp @@ -56,7 +56,7 @@ void run_single_scenario(const InfoType& scenario_info, int apiId) { ASSERT_EQ(dist, 5); } else if (apiId == 1) { auto rit = KE::move_backward("mylabel", exespace(), KE::begin(v), - KE::end(v), KE::end(v2)); + KE::end(v), KE::end(v2)); const int dist = KE::distance(KE::begin(v2), rit); ASSERT_EQ(dist, 5); } else if (apiId == 2) { diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index f169fd9ce881700b98154e484df9340eee4130a7..e47cacdd7d9cf37e48be91f21213c1d87837e950 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -95,7 +95,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -110,11 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - 
std::is_same<value_type, typename ViewTypeDestTrue::value_type>::value, - ""); + std::is_same_v<value_type, typename ViewTypeDestTrue::value_type>); static_assert( - std::is_same<value_type, typename ViewTypeDestFalse::value_type>::value, - ""); + std::is_same_v<value_type, typename ViewTypeDestFalse::value_type>); const std::size_t ext = view_from.extent(0); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index c35fc5c24b20687d5d410c1d0ed61bb3b547ce66..f897e9b65749b5ac88eb284b951584bed9c79e67 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -99,7 +99,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -147,7 +147,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove(exespace(), KE::begin(view), KE::end(view), - (ValueType)match_value); + (ValueType)match_value); verify_data(data_h, view, rit); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 3d7c52108be0578943dc0580fbe948f16cc73d35..3137880ea813a2c3eda41f2e1a8a31f9244f5ae8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp 
b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index cb699aa923568e6c2c2079cae096fe92650f7332..d88ab5473de6b90f822fb7d37c73d68e5368b576 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index f06f2234eedb6bde1a17804fb3a4309a1632278f..e42788799e479806f502aff4fe4e52c739a5896b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -144,7 +144,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if(exespace(), KE::begin(view), KE::end(view), - remove_if_even); + remove_if_even); verify_data(data_h, view, rit, remove_if_even); } @@ -154,7 +154,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if("label", exespace(), KE::begin(view), - KE::end(view), remove_if_even); + KE::end(view), remove_if_even); verify_data(data_h, view, rit, remove_if_even); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp index 
a22ab32d764ae46783c8a3f9d4b794df95733bb8..4596726cf3ce56f3355a308f5de21be1db9265ff 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -153,7 +153,7 @@ void verify_data(const std::string& name, ViewType1 test_view, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp index a964ec8e173e7656a4b69fbfeb0a4e9adfa65da8..b18c859af593a49e6ca0d59f589bf1ee9f8066c2 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp index ceeba889711953979009726e7e174ee1e996a4a0..82f859bac1243ead6746f243117fc55d8bdcdf47 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } 
Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index 802c0093c5ccc92c7d15da032854606793ae2fb9..5ae2ff4278539ed945ed77c10764217e2cbe35e7 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -96,7 +96,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp index 6e6ca72783003d6f407b793b7fec1989c5b4c72a..3c934d64850c0e593fbec907515aa770f15f9602 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp @@ -62,7 +62,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index 5638cbee4a621aec4dfcb80966674d233533e4c6..bf5c2ee7828b4942e918de817dec51f1ec8f861b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff 
--git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp index d0caca7cea3f9576ff6814ef79d40ae0ef8db06f..1a860c58cee8ee56176d7f5fbc4b30d0499d1cb8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void run_single_scenario(const InfoType& scenario_info, create_view<ValueType>(Tag{}, view_ext, "rotate_copy_dest"); auto n_it = KE::cbegin(view_from) + rotation_point; auto rit = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it, - KE::cend(view_from), KE::begin(view_dest)); + KE::cend(view_from), KE::begin(view_dest)); verify_data(view_from, view_dest, rotation_point); ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp index 021609c444d276dd7ba659262815f2057ad04582..195f88a0b737da08070c857190d4df0f93073615 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp @@ -256,7 +256,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, { auto myrit = KE::search(exespace(), KE::cbegin(view), KE::cend(view), - KE::cbegin(s_view), KE::cend(s_view), args...); + KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); ASSERT_EQ(mydiff, stddiff); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp index 
53ad8daa2ec93779c3ee877827125fd1378d674c..79d88bec23f70341d2c6a07c7b6745fb148f2b76 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp @@ -154,7 +154,7 @@ void fill_view(ViewType dest_view, ValueType value, std::size_t count, } else { - throw std::runtime_error("Kokkos: test: search_n: this should not happen"); + FAIL() << "Kokkos: test: search_n: this should not happen"; } Kokkos::deep_copy(aux_view, v_h); @@ -208,7 +208,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count, { auto myrit = KE::search_n("label", exespace(), KE::cbegin(view), - KE::cend(view), count, value, args...); + KE::cend(view), count, value, args...); const auto mydiff = myrit - KE::cbegin(view); ASSERT_EQ(mydiff, stddiff); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index 0b5fe9216eac36e5f3422efa61d3bd22b322d017..12835d5a2f7c636c4a422ed9e626fa56f1e2f7be 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -150,7 +150,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_left or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_left("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index 8e4ae94375902ccf9a8f7e0d899b8ab7cecb2844..3e350cf3b384f1bee094ee1e3e9a0b8cb82d7618 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ 
-141,7 +141,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right(exespace(), KE::begin(view), KE::end(view), - shift_value); + shift_value); verify_data(rit, view, view_h, shift_value); } @@ -152,7 +152,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp index c388cadc9bba4a1095515d75cb452232474dea83..5a2c04693945d6b80a90582a574d7f0b5b6ffad7 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp @@ -62,8 +62,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest)); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -73,8 +73,8 @@ struct TestFunctorA { case 1: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_binaryOp); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_binaryOp); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git 
a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp index e24ac37bf0122dbf9dfc51dd2e5bc2c121c97144..071ecd5a9a807577373235bb1ede44de0cd850fd 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index b32a9be3a1788c7b842ba9b8013371cf56816e4e..3f83ac7404fe31e9e8916f709ebab587db80e15b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -139,12 +139,12 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor<ValueType> predicate(threshold); for (std::size_t i = 0; i < sourceView.extent(0); ++i) { auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate); + KE::begin(rowDest), predicate); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); 
ASSERT_TRUE(intraTeamSentinelView_h(i)); @@ -166,6 +166,10 @@ void run_all_scenarios() { } TEST(std_algorithms_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios<DynamicTag, double>(); run_all_scenarios<StridedTwoRowsTag, int>(); run_all_scenarios<StridedThreeRowsTag, unsigned>(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp index 7cbc788f8e3c1dae46fe408062e50dbb460e9cf5..9b509af55bf21108c01671d53d947cce7c9a4d45 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp @@ -53,7 +53,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy_n(member, KE::begin(myRowViewFrom), m_copyCount, - KE::begin(myRowViewDest)); + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp index 922424afbd98bdd1ee32f79c134215a1b8193ed0..38df5c30cec89c32b406e0103ce7077766f88030 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp @@ -111,7 +111,7 @@ void test_A(const bool searched_value_exist, std::size_t numTeams, using rand_pool = Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace>; - rand_pool pool(lowerBound * upperBound); + rand_pool pool(static_cast<uint64_t>(lowerBound) * upperBound); if (searched_value_exist) { Kokkos::View<std::size_t*, Kokkos::DefaultHostExecutionSpace> randomIndices( diff 
--git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index c6b2566c6cfb301d8a0ac9802054df99e37ab145..0c35c5e599343ff285fb60ac7731ca297795e107 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::exclusive_scan(member, KE::cbegin(rowViewSrc), - KE::cend(rowViewSrc), - KE::begin(rowViewDest), initVal); + KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal); resultDist = KE::distance(KE::begin(rowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); @@ -85,7 +85,7 @@ struct TestFunctorA { break; } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET case 2: { auto it = KE::exclusive_scan( @@ -121,7 +121,9 @@ struct TestFunctorA { } }; -template <class LayoutTag, class ValueType> +struct InPlace {}; + +template <class LayoutTag, class ValueType, class InPlaceOrVoid = void> void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -147,9 +149,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View<ValueType**> destView("destView", numTeams, numCols); - // exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -168,12 +167,19 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); 
Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + Kokkos::View<ValueType**> destView("destView", numTeams, numCols); + if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -207,7 +213,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { break; } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET case 2: case 3: { auto it = exclusive_scan(KE::cbegin(rowFrom), KE::cend(rowFrom), @@ -223,29 +229,42 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template <class LayoutTag, class ValueType> +template <class LayoutTag, class ValueType, class InPlaceOrVoid = void> void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef 
KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { #else for (int apiId : {0, 1}) { #endif - test_A<LayoutTag, ValueType>(numTeams, numCols, apiId); + test_A<LayoutTag, ValueType, InPlaceOrVoid>(numTeams, numCols, apiId); } } } } TEST(std_algorithms_exclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios<DynamicTag, double>(); run_all_scenarios<StridedTwoRowsTag, int>(); run_all_scenarios<StridedThreeRowsTag, unsigned>(); + + run_all_scenarios<DynamicTag, double, InPlace>(); + run_all_scenarios<StridedTwoRowsTag, int, InPlace>(); + run_all_scenarios<StridedThreeRowsTag, unsigned, InPlace>(); } } // namespace TeamExclusiveScan diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp index 430e4917e06e1afec8afd6fecd310cac7f0f7894..88c5e21f312fac17b5f6d1dddad93b68043c7d11 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp @@ -51,7 +51,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), searchedValue); + KE::cend(myRowViewFrom), searchedValue); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp index 83eca33569e1fecae2a5751ccf09ec8bd5626743..d350bc62cdb3a0bc010417cd6177f948672d8f9c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp @@ 
-86,9 +86,9 @@ struct TestFunctorA { case 2: { auto it = KE::find_end(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::cbegin(myRowSearchedSeqView), - KE::cend(myRowSearchedSeqView), m_binaryPred); + KE::cend(myRowViewFrom), + KE::cbegin(myRowSearchedSeqView), + KE::cend(myRowSearchedSeqView), m_binaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -99,7 +99,7 @@ struct TestFunctorA { case 3: { auto it = KE::find_end(member, myRowViewFrom, myRowSearchedSeqView, - m_binaryPred); + m_binaryPred); resultDist = KE::distance(KE::begin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp index ee4bbed7a30d36ac7d36e94b1a2607c26aab609f..70f2be77f63273d3328bddbe986d76a9a000a5bc 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp index b9448c1a3e688f2bf9879bb20fc42a205ab98fb8..873e8faf4cad8d18e50e83c3ec59bac8e7feac29 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp @@ -70,7 +70,7 @@ struct 
TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if_not(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp index 4b66dd9131fa128d7e2995e0cdb2f64d32e6e043..265cdf47461663be16ca5092ce856d77c1bbd375 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::generate_n(member, myRowView, m_count, - GenerateFunctor<value_type>()); + GenerateFunctor<value_type>()); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp index 0daf9dbfe824f1f40cebfaf87b1a37efad36c445..b5f4cdd6123f9f069865110dbfb578d30a694e66 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -139,7 +139,9 @@ struct TestFunctorA { } }; -template <class LayoutTag, class ValueType> +struct InPlace {}; + +template <class LayoutTag, class ValueType, class InPlaceOrVoid = void> void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -165,9 +167,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; 
Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View<ValueType**> destView("destView", numTeams, numCols); - // inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -186,12 +185,20 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View<ValueType**> destView("destView", numTeams, numCols); + if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -251,25 +258,38 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef inclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template <class LayoutTag, class 
ValueType> +template <class LayoutTag, class ValueType, class InPlaceOrVoid = void> void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3, 4, 5}) { - test_A<LayoutTag, ValueType>(numTeams, numCols, apiId); + test_A<LayoutTag, ValueType, InPlaceOrVoid>(numTeams, numCols, apiId); } } } } TEST(std_algorithms_inclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios<DynamicTag, double>(); run_all_scenarios<StridedTwoRowsTag, int>(); run_all_scenarios<StridedThreeRowsTag, unsigned>(); + + run_all_scenarios<DynamicTag, double, InPlace>(); + run_all_scenarios<StridedTwoRowsTag, int, InPlace>(); + run_all_scenarios<StridedThreeRowsTag, unsigned, InPlace>(); } } // namespace TeamInclusiveScan diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp index 1928f9558806524596b929144b167037ab38f618..21da333e75decfc9de3e72db9aba90ed44b9799a 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp @@ -191,7 +191,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, // ----------------------------------------------- auto returnView_h = create_host_space_copy(returnView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor<ValueType> predicate(threshold); for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) { auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL()); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp 
b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp index f9adeb0654b83ae6ac101eb1d69f2695c6c794ff..f76a595b3f421b8c4539b0b6fabb80e93e2d6518 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp @@ -52,7 +52,7 @@ struct TestFunctorA { Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, KE::cbegin(myRowView), KE::cend(myRowView), @@ -62,7 +62,7 @@ struct TestFunctorA { } else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, myRowView, - CustomLessThanComparator<value_type>{}); + CustomLessThanComparator<value_type>{}); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } @@ -179,7 +179,7 @@ template <class LayoutTag, class ValueType> void run_all_scenarios(bool makeDataSortedOnPurpose) { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5153}) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { #else for (int apiId : {0, 1}) { diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp index 33af5f99def666bf9af9ddc00408c638b582a7e0..5bc49e46007e4686e5c974281f20a742040f95f4 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp @@ -61,7 +61,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView)); + 
KE::cend(myRowView)); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -73,12 +73,12 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView), - CustomLessThanComparator<value_type>{}); + KE::cend(myRowView), + CustomLessThanComparator<value_type>{}); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -88,7 +88,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, myRowView, - CustomLessThanComparator<value_type>{}); + CustomLessThanComparator<value_type>{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -210,7 +210,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::is_sorted_until(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator<ValueType>{}); + CustomLessThanComparator<ValueType>{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); @@ -226,7 +226,7 @@ template <class LayoutTag, class ValueType> void run_all_scenarios(const std::string& name, const std::vector<int>& cols) { for (int numTeams : teamSizesToTest) { for (const auto& numCols : cols) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { #else for (int apiId : {0, 1}) { diff --git 
a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp index fb891a8780fbe2dbeadbdff9f5fc586dc293afb9..452a48df21611a4a2b2525c8efc79b8f965bf166 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp @@ -59,7 +59,7 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::max_element(member, myRowView, - CustomLessThanComparator<value_type>{}); + CustomLessThanComparator<value_type>{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::max_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator<value_type>{}); + CustomLessThanComparator<value_type>{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } @@ -170,7 +170,7 @@ void run_all_scenarios() { } TEST(std_algorithms_max_element_team_test, test) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios<DynamicTag, int>(); run_all_scenarios<StridedTwoRowsTag, double>(); run_all_scenarios<StridedThreeRowsTag, int>(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp index 4ba1b6f968bcf7ac1b1fa292e4d8c06bb469328e..2c79370b926e7fee2d6637fcc879e8065028e7fc 100644 --- 
a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp @@ -59,7 +59,7 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::min_element(member, myRowView, - CustomLessThanComparator<value_type>{}); + CustomLessThanComparator<value_type>{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::min_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator<value_type>{}); + CustomLessThanComparator<value_type>{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); @@ -169,7 +169,7 @@ void run_all_scenarios() { } TEST(std_algorithms_min_element_team_test, test) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios<DynamicTag, int>(); run_all_scenarios<StridedTwoRowsTag, double>(); run_all_scenarios<StridedThreeRowsTag, int>(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp index 17562a55727b89f6561a9d0730b72830acfd21df..25a4487855b65cd2ca114f5c7b2f72cd9c86841d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp @@ -66,7 +66,7 @@ struct TestFunctorA { 
m_distancesView(myRowIndex, 1) = resultDist2; }); } -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto itPair = @@ -84,7 +84,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto itPair = KE::minmax_element(member, myRowView, - CustomLessThanComparator<value_type>{}); + CustomLessThanComparator<value_type>{}); resultDist1 = KE::distance(KE::begin(myRowView), itPair.first); resultDist2 = KE::distance(KE::begin(myRowView), itPair.second); @@ -160,7 +160,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } else { auto itPair = std::minmax_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator<value_type>{}); + CustomLessThanComparator<value_type>{}); stdDistance[0] = KE::distance(KE::cbegin(myRow), itPair.first); stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } @@ -188,7 +188,7 @@ void run_all_scenarios() { } TEST(std_algorithms_minmax_element_team_test, test) { -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios<DynamicTag, int>(); run_all_scenarios<StridedTwoRowsTag, double>(); run_all_scenarios<StridedThreeRowsTag, int>(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp index 1122d6d554ac270911cf4305fd242592df96294a..2c445dacf8e7709106af243d473db9218ad02b20 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::move(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist 
= KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp index c0bbdfa39041e0ef758564788eff7d70f184b8ef..78ab6bf1f8d1431873522c6b53f546605ee3ead6 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp @@ -240,7 +240,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, "stdDestTrueView", numTeams, numCols); Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestFalseView( "stdDestFalseView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor<ValueType> predicate(threshold); for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) { auto myRowSource = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL()); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp index 954d4612468d39f5754ec2e46e34ab59cf46d5fb..370e91cc1ff18d77621511f76130fae39617a0d7 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp @@ -197,7 +197,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, auto distancesView_h = create_host_space_copy(distancesView); auto dataViewAfterOp_h = create_host_space_copy(dataView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor<ValueType> predicate(threshold); for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) { auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL()); diff 
--git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp index 94c2a8f1f9a7259c2870132a7a4f0dc4a71b77ea..eb00d9e083a29e30fd8e2b661b00438815a8cd78 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp @@ -16,7 +16,7 @@ #include <TestStdAlgorithmsCommon.hpp> -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET namespace Test { namespace stdalgos { diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp index fb9c70391b3d6de84b5a5a06b2f576962bafe120..2defa1dc6fc8d55ce96f02db327f45125a70127a 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove(member, KE::begin(myRowView), KE::end(myRowView), - m_targetValue); + m_targetValue); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 24b840154b7383717c9602bbef0d709f0f3b4453..71a50e39e3ef46604b498a9298ae4c826190dbea 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove_copy(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_targetValue); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_targetValue); resultDist = 
KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -212,6 +212,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios<DynamicTag, double>(); run_all_scenarios<StridedTwoRowsTag, int>(); run_all_scenarios<StridedThreeRowsTag, unsigned>(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index 2082fa972880c51c5794942a5d68c3aef2f136a5..d5b5304f63150ee1d6956338209d4dcd0c5fed4a 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -65,8 +65,8 @@ struct TestFunctorA { GreaterThanValueFunctor predicate(m_threshold); if (m_apiPick == 0) { auto it = KE::remove_copy_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), predicate); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), predicate); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor<ValueType> predicate(threshold); for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) { auto rowFrom = Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); @@ -168,6 +168,10 @@ void 
run_all_scenarios() { } TEST(std_algorithms_remove_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios<DynamicTag, double>(); run_all_scenarios<StridedTwoRowsTag, int>(); run_all_scenarios<StridedThreeRowsTag, unsigned>(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp index 3315f281da616e3ce05e2105c7a08d7a2dd83af0..3dd7cb764c61b5864001661db7f6dcc2d9fb6664 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp @@ -127,7 +127,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { // ----------------------------------------------- // check against std // ----------------------------------------------- - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor<ValueType> predicate(threshold); auto dataViewAfterOp_h = create_host_space_copy(dataView); auto distancesView_h = create_host_space_copy(distancesView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp index 70dbf10574b85a79378099f7bb85b94d161f2559..64f172e401cc0e8c810b61162347c266484377c3 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp @@ -78,7 +78,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy(member, myRowViewFrom, myRowViewDest, - m_targetValue, m_newValue); + m_targetValue, m_newValue); resultDist = 
KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -172,7 +172,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), targetVal, newVal); + KE::begin(rowDest), targetVal, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp index ae43a2a4269cb75f1164e2106e952cbb57c1afa8..9c3699320d8cc1d68ae6d28fc053c0a9f712371c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp @@ -76,7 +76,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy_if(member, myRowViewFrom, myRowViewDest, - predicate, m_newValue); + predicate, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -145,13 +145,13 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor<ValueType> predicate(threshold); for (std::size_t i = 0; i < sourceView.extent(0); ++i) { auto rowFrom = Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, 
Kokkos::ALL()); auto it = std::replace_copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate, newVal); + KE::begin(rowDest), predicate, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp index 1d5d9578f948185bfd2fc39dfa21556a03b0c580..d79b53d3551405cf9e34883330bda4c3e500a165 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp @@ -103,7 +103,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDataView(i, j) = cloneOfDataViewBeforeOp_h(i, j); } } - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor<ValueType> predicate(threshold); for (std::size_t i = 0; i < dataView.extent(0); ++i) { auto thisRow = Kokkos::subview(stdDataView, i, Kokkos::ALL()); std::replace_if(KE::begin(thisRow), KE::end(thisRow), predicate, newVal); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp index e865b998f600321f775b042b981fcec6275b49bb..51f600fabad643987650c3570009fb0898e0a1b8 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp @@ -136,7 +136,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, std::size_t pivotShift, auto pivot = KE::cbegin(myRowFrom) + pivotShift; auto it = std::rotate_copy(KE::cbegin(myRowFrom), pivot, - KE::cend(myRowFrom), KE::begin(myRowDest)); + KE::cend(myRowFrom), KE::begin(myRowDest)); const std::size_t stdDistance = KE::distance(KE::begin(myRowDest), it); ASSERT_EQ(stdDistance, 
distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp index 00a80c5ef070e7cdc6b653f43d6887dc775c5d0e..08ff8fbbca6200ad5eab691d9dd4eee3f02f5079 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp @@ -47,7 +47,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::shift_right(member, KE::begin(myRowView), - KE::end(myRowView), m_shift); + KE::end(myRowView), m_shift); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp index 5fc9612caa7bc76795d4ad45a05d24f00841b875..60cb3f083779fda269f67af02847c48597ddf25f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp @@ -49,7 +49,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::swap_ranges(member, KE::begin(myRowView1), - KE::end(myRowView1), KE::begin(myRowView2)); + KE::end(myRowView1), KE::begin(myRowView2)); resultDist = KE::distance(KE::begin(myRowView2), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp index 9f30812d8ef03cf40fc56d9f1041b2777f86e726..1c438543819dcae00c593b1403b0823fc72b1fc4 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp +++ 
b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -16,7 +16,7 @@ #include <TestStdAlgorithmsCommon.hpp> -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET namespace Test { namespace stdalgos { @@ -108,7 +108,9 @@ struct TestFunctorA { } }; -template <class LayoutTag, class ValueType> +struct InPlace {}; + +template <class LayoutTag, class ValueType, class InPlaceOrVoid = void> void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -134,9 +136,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View<ValueType**> destView("destView", numTeams, numCols); - // tranform_exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -156,12 +155,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View<ValueType**> destView("destView", numTeams, numCols); + if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, 
+ initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -200,16 +208,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef transform_exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template <class LayoutTag, class ValueType> +template <class LayoutTag, class ValueType, class InPlaceOrVoid = void> void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1}) { - test_A<LayoutTag, ValueType>(numTeams, numCols, apiId); + test_A<LayoutTag, ValueType, InPlaceOrVoid>(numTeams, numCols, apiId); } } } @@ -219,6 +232,10 @@ TEST(std_algorithms_transform_exclusive_scan_team_test, test) { run_all_scenarios<DynamicTag, double>(); run_all_scenarios<StridedTwoRowsTag, int>(); run_all_scenarios<StridedThreeRowsTag, unsigned>(); + + run_all_scenarios<DynamicTag, double, InPlace>(); + run_all_scenarios<StridedTwoRowsTag, int, InPlace>(); + run_all_scenarios<StridedThreeRowsTag, unsigned, InPlace>(); } } // namespace TeamTransformExclusiveScan diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 4b3166023267927db16903e022cdf5f02dd54a3f..78a21c443055750b233b250f410d7aff9f8ca704 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ 
b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -16,7 +16,7 @@ #include <TestStdAlgorithmsCommon.hpp> -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET namespace Test { namespace stdalgos { @@ -91,7 +91,7 @@ struct TestFunctorA { case 1: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp); + m_binaryOp, m_unaryOp); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); @@ -111,7 +111,7 @@ struct TestFunctorA { case 3: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp, initVal); + m_binaryOp, m_unaryOp, initVal); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); @@ -131,7 +131,9 @@ struct TestFunctorA { } }; -template <class LayoutTag, class ValueType> +struct InPlace {}; + +template <class LayoutTag, class ValueType, class InPlaceOrVoid = void> void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -157,9 +159,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy<space_t> policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View<ValueType**> destView("destView", numTeams, numCols); - // tranform_inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -179,12 +178,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = 
Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View<ValueType**> destView("destView", numTeams, numCols); + if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -236,16 +244,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { } #undef transform_inclusive_scan - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v<InPlaceOrVoid, InPlace>) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template <class LayoutTag, class ValueType> +template <class LayoutTag, class ValueType, class InPlaceOrVoid = void> void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3}) { - test_A<LayoutTag, ValueType>(numTeams, numCols, apiId); + test_A<LayoutTag, ValueType, InPlaceOrVoid>(numTeams, numCols, apiId); } } } @@ -255,6 +268,10 @@ TEST(std_algorithms_transform_inclusive_scan_team_test, test) { run_all_scenarios<DynamicTag, double>(); run_all_scenarios<StridedTwoRowsTag, int>(); run_all_scenarios<StridedThreeRowsTag, 
unsigned>(); + + run_all_scenarios<DynamicTag, double, InPlace>(); + run_all_scenarios<StridedTwoRowsTag, int, InPlace>(); + run_all_scenarios<StridedThreeRowsTag, unsigned, InPlace>(); } } // namespace TeamTransformInclusiveScan diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp index b0a3241ec4bf84685987c76e088daa67691f2f31..17ded226aae0e9ccf9dca02100eee8a1d41d471c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp @@ -16,7 +16,7 @@ #include <TestStdAlgorithmsCommon.hpp> -#if not defined KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET namespace Test { namespace stdalgos { diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp index c46146e0a8f6f6dbc309f208ffdc3a881755faa6..cef0f7c13d07a039625a998d1e719bd491b65bb5 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp @@ -58,7 +58,7 @@ struct TestFunctorA { } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::unique(member, KE::begin(myRowView), KE::end(myRowView), - CustomEqualityComparator<value_type>{}); + CustomEqualityComparator<value_type>{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::begin(myRow), it); } else { auto it = std::unique(KE::begin(myRow), KE::end(myRow), - CustomEqualityComparator<value_type>{}); + CustomEqualityComparator<value_type>{}); stdDistance = 
KE::distance(KE::begin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 87687b60a16e13ceb58ff2a3d53aacdb3e6e0ec1..89ea8154c7ecaec49c8c528485b967d7a8c80db5 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -72,8 +72,8 @@ struct TestFunctorA { using comparator_t = CustomEqualityComparator<typename SourceViewType::value_type>; auto it = KE::unique_copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), - KE::begin(myRowViewDest), comparator_t()); + KE::end(myRowViewFrom), + KE::begin(myRowViewDest), comparator_t()); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -159,12 +159,12 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { std::size_t stdDistance = 0; if (apiId <= 1) { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest)); + KE::begin(myRowDest)); stdDistance = KE::distance(KE::begin(myRowDest), it); } else { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest), - CustomEqualityComparator<value_type>{}); + KE::begin(myRowDest), + CustomEqualityComparator<value_type>{}); stdDistance = KE::distance(KE::begin(myRowDest), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); @@ -186,6 +186,10 @@ void run_all_scenarios() { } TEST(std_algorithms_unique_copy_team_test, test) { + // FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios<DynamicTag, int>(); run_all_scenarios<StridedTwoRowsTag, int>(); 
run_all_scenarios<StridedThreeRowsTag, int>(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index 9dac3ce75ffa9fdc40347850527305fbb74abff2..365ca21688b4e8db2e51c349c7ba77b77c3b74ba 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -16,6 +16,7 @@ #include <TestStdAlgorithmsCommon.hpp> #include <utility> +#include <iomanip> namespace Test { namespace stdalgos { @@ -114,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -160,24 +161,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - - if (std::is_same<gold_view_value_type, int>::value) { + if (std::is_same_v<gold_view_value_type, int>) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast<double>(test_view_h(i)) << " " + << static_cast<double>(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -205,17 +197,13 @@ void 
run_single_scenario(const InfoType& scenario_info, ValueType init_value, BinaryOp bop, UnaryOp uop) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "transform_exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - - auto view_dest = - create_view<ValueType>(Tag{}, view_ext, "transform_exclusive_scan"); - auto view_from = - create_view<ValueType>(Tag{}, view_ext, "transform_exclusive_scan"); + + auto view_from = create_view<ValueType>(Tag{}, view_ext, + "transform_exclusive_scan_view_from"); fill_view(view_from, name); + auto view_dest = create_view<ValueType>(Tag{}, view_ext, + "transform_exclusive_scan_view_dest"); { fill_zero(view_dest); auto r = KE::transform_exclusive_scan( @@ -253,6 +241,65 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, Kokkos::fence(); } +template <class Tag, class ValueType, class InfoType, class BinaryOp, + class UnaryOp> +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, BinaryOp bop, + UnaryOp uop) { + const auto name = std::get<0>(scenario_info); + const std::size_t view_ext = std::get<1>(scenario_info); + + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place + + auto view1 = + create_view<ValueType>(Tag{}, view_ext, "transform_exclusive_scan_view1"); + fill_view(view1, name); + + auto view2 = + create_view<ValueType>(Tag{}, view_ext, "transform_exclusive_scan_view2"); + + { + fill_view(view2, name); + auto 
r = KE::transform_exclusive_scan(exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), + init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan( + "label", exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan(exespace(), view2, view2, init_value, + bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan("label", exespace(), view2, view2, + init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + Kokkos::fence(); +} + template <class Tag, class ValueType> void run_all_scenarios() { const std::map<std::string, std::size_t> scenarios = { @@ -267,6 +314,11 @@ void run_all_scenarios() { run_single_scenario<Tag, ValueType>(it, ValueType{1}, bop_t(), uop_t()); run_single_scenario<Tag, ValueType>(it, ValueType{-2}, bop_t(), uop_t()); run_single_scenario<Tag, ValueType>(it, ValueType{3}, bop_t(), uop_t()); + + run_single_scenario_inplace<Tag, ValueType>(it, ValueType{0}, bop_t(), + uop_t()); + run_single_scenario_inplace<Tag, ValueType>(it, ValueType{-2}, bop_t(), + uop_t()); } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index a90a68ca1d7588932bbffd034b890824e69ba0ec..cc8726214786a4a2c8b16c92ede148bc5acd6174 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -16,6 +16,7 @@ #include 
<TestStdAlgorithmsCommon.hpp> #include <utility> +#include <iomanip> namespace Test { namespace stdalgos { @@ -114,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -172,24 +173,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - - if (std::is_same<gold_view_value_type, int>::value) { + if (std::is_same_v<gold_view_value_type, int>) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast<double>(test_view_h(i)) << " " + << static_cast<double>(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -210,30 +202,11 @@ struct SumBinaryFunctor { std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template <class Tag, class BopT, class UopT> -void print_scenario_details(const std::string& name, BopT bop, UopT uop) { - (void)bop; - (void)uop; - std::cout << "transform_inclusive_scan: " << name << ", " - << view_tag_to_string(Tag{}) << std::endl; -} - -template <class Tag, class BopT, class UopT, class ValueType> -void print_scenario_details(const std::string& name, BopT bop, UopT uop, - ValueType 
init_value) { - (void)bop; - (void)uop; - std::cout << "transform_inclusive_scan: " << name << ", " - << view_tag_to_string(Tag{}) << ", " - << "init = " << init_value << std::endl; -} - template <class Tag, class ValueType, class InfoType, class... Args> void run_single_scenario(const InfoType& scenario_info, Args... args /* by value on purpose*/) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // print_scenario_details<Tag>(name, args...); auto view_dest = create_view<ValueType>(Tag{}, view_ext, "transform_inclusive_scan"); @@ -278,6 +251,63 @@ void run_single_scenario(const InfoType& scenario_info, Kokkos::fence(); } +template <class Tag, class ValueType, class InfoType, class... Args> +void run_single_scenario_inplace(const InfoType& scenario_info, + Args... args /* by value on purpose*/) { + const auto name = std::get<0>(scenario_info); + const std::size_t view_ext = std::get<1>(scenario_info); + + // since here we call the in-place operation, we need to use two views: + // view1: filled according to scenario and is not modified + // view2: filled according scenario and used for the in-place op + // Therefore, after the op is done, view_2 should contain the + // result of doing exclusive scan. 
+ // NOTE: view2 must be filled before every call to the algorithm + // because the algorithm acts in place + + auto view_1 = create_view<ValueType>(Tag{}, view_ext, + "transform_inclusive_scan_view_1"); + fill_view(view_1, name); + + auto view_2 = create_view<ValueType>(Tag{}, view_ext, + "transform_inclusive_scan_view_2"); + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_2), + KE::cend(view_2), KE::begin(view_2), + args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan("label", exespace(), + KE::cbegin(view_2), KE::cend(view_2), + KE::begin(view_2), args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan(exespace(), view_2, view_2, args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan("label", exespace(), view_2, view_2, + args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + Kokkos::fence(); +} + template <class Tag, class ValueType> void run_all_scenarios() { const std::map<std::string, std::size_t> scenarios = { @@ -294,15 +324,23 @@ void run_all_scenarios() { run_single_scenario<Tag, ValueType>(it, bop_t(), uop_t(), ValueType{2}); run_single_scenario<Tag, ValueType>(it, bop_t(), uop_t(), ValueType{-1}); run_single_scenario<Tag, ValueType>(it, bop_t(), uop_t(), ValueType{-2}); + + run_single_scenario_inplace<Tag, ValueType>(it, bop_t(), uop_t()); + run_single_scenario_inplace<Tag, ValueType>(it, bop_t(), uop_t(), + ValueType{0}); + run_single_scenario_inplace<Tag, ValueType>(it, bop_t(), uop_t(), + ValueType{2}); + run_single_scenario_inplace<Tag, ValueType>(it, bop_t(), uop_t(), + ValueType{-2}); } } #if !defined KOKKOS_ENABLE_OPENMPTARGET 
TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) { run_all_scenarios<DynamicTag, double>(); - // run_all_scenarios<StridedThreeTag, double>(); - // run_all_scenarios<DynamicTag, int>(); - // run_all_scenarios<StridedThreeTag, int>(); + run_all_scenarios<StridedThreeTag, double>(); + run_all_scenarios<DynamicTag, int>(); + run_all_scenarios<StridedThreeTag, int>(); } #endif diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index 9c5ae0cf8a1e465c955f12797aa35269a53be9df..6ee93e3d5fa02b68a83269baa960c6dececcdde5 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -138,7 +138,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index 3cf43ad4db8ff96dd9bbe3d66e563c822258000e..e3e969645839295e67fa19aca61cf49ee98f259c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -146,7 +146,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -235,7 +235,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp index 3847e1e6a3663c0aca1418638dd111e3e787fea3..0044b935587fb86ae12885bfd5c18fb5cbf103e5 
100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp @@ -72,7 +72,7 @@ auto create_host_view_with_reduction_order_indices( result(8) = 7; result(9) = 5; } else { - throw std::runtime_error("test: Invalid enum"); + Kokkos::abort("test: Invalid enum"); } return result; @@ -80,12 +80,9 @@ auto create_host_view_with_reduction_order_indices( template <int flag, class ExeSpace, class IndexType, class ViewType> auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { - static_assert(std::is_same<ExeSpace, Kokkos::HostSpace>::value, + static_assert(std::is_same_v<ExeSpace, Kokkos::HostSpace>, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = std::conditional_t< (flag == 0), Kokkos::MaxFirstLoc<view_value_type, IndexType, ExeSpace>, @@ -132,18 +129,24 @@ TEST(std_algorithms_reducers, max_first_loc) { const auto pair1 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::LeftToRight); - ASSERT_EQ(pair1.first, gold_value); - ASSERT_EQ(pair1.second, gold_location); + ASSERT_EQ(pair1.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); + ASSERT_EQ(pair1.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); const auto pair2 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::RightToLeft); - ASSERT_EQ(pair2.first, gold_value); - ASSERT_EQ(pair2.second, gold_location); + ASSERT_EQ(pair2.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); + ASSERT_EQ(pair2.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); const auto pair3 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::Random); - ASSERT_EQ(pair3.first, gold_value); - 
ASSERT_EQ(pair3.second, gold_location); + ASSERT_EQ(pair3.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::Random); + ASSERT_EQ(pair3.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::Random); } TEST(std_algorithms_reducers, min_first_loc) { @@ -188,12 +191,9 @@ template <class ExeSpace, class IndexType, class ViewType, class ValuesPair, class IndexPair> void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, const ValuesPair gold_values, const IndexPair gold_locs) { - static_assert(std::is_same<ExeSpace, Kokkos::HostSpace>::value, + static_assert(std::is_same_v<ExeSpace, Kokkos::HostSpace>, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = Kokkos::MinMaxFirstLastLoc<view_value_type, IndexType, ExeSpace>; @@ -212,10 +212,10 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, reduction_value_type{view(index), view(index), index, index}); } - ASSERT_EQ(red_result.min_val, gold_values.first); - ASSERT_EQ(red_result.max_val, gold_values.second); - ASSERT_EQ(red_result.min_loc, gold_locs.first); - ASSERT_EQ(red_result.max_loc, gold_locs.second); + ASSERT_EQ(red_result.min_val, gold_values.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_val, gold_values.second) << order_to_string(enValue); + ASSERT_EQ(red_result.min_loc, gold_locs.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_loc, gold_locs.second) << order_to_string(enValue); } TEST(std_algorithms_reducers, min_max_first_last_loc) { diff --git a/packages/kokkos/benchmarks/CMakeLists.txt b/packages/kokkos/benchmarks/CMakeLists.txt index 42279bf55db83ad5a0aeade86e18a918d88370a6..968c8ae3bf59826899167e67d13847a3a094ae40 100644 --- a/packages/kokkos/benchmarks/CMakeLists.txt +++ b/packages/kokkos/benchmarks/CMakeLists.txt @@ -1 +1,12 @@ 
-KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) +#FIXME_OPENMPTARGET - compiling in debug mode causes ICE. +kokkos_add_benchmark_directories(atomic) +kokkos_add_benchmark_directories(gather) +kokkos_add_benchmark_directories(gups) +kokkos_add_benchmark_directories(launch_latency) +kokkos_add_benchmark_directories(stream) +kokkos_add_benchmark_directories(view_copy_constructor) +#FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. +if(NOT Kokkos_ENABLE_OPENMPTARGET) + kokkos_add_benchmark_directories(policy_performance) + kokkos_add_benchmark_directories(bytes_and_flops) +endif() diff --git a/packages/kokkos/benchmarks/atomic/CMakeLists.txt b/packages/kokkos/benchmarks/atomic/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fda2bf6f6a409b6d00beb6089488500f0d9c1bc --- /dev/null +++ b/packages/kokkos/benchmarks/atomic/CMakeLists.txt @@ -0,0 +1 @@ +kokkos_add_executable(atomic SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt b/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c65d06ce28eadfd298bd42b8ba478a537c1ad96 --- /dev/null +++ b/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt @@ -0,0 +1,9 @@ +kokkos_add_executable( + bytes_and_flops + SOURCES + bench_double.cpp + bench_float.cpp + bench_int32_t.cpp + bench_int64_t.cpp + main.cpp +) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp index 2589fd7309b29244c764b61b0335f3a0f2d756f9..88830af624b6a80e9ead054b8e47211d560cca8e 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp @@ -37,22 +37,22 @@ struct RunStride { }; #define STRIDE 1 -#include <bench_stride.hpp> +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 2 -#include 
<bench_stride.hpp> +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 4 -#include <bench_stride.hpp> +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 8 -#include <bench_stride.hpp> +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 16 -#include <bench_stride.hpp> +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 32 -#include <bench_stride.hpp> +#include "bench_stride.hpp" #undef STRIDE template <class Scalar> diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp index f955c996660a9ccefa3b0084f12ff140d3bfcebd..2fda1ae3d427ec9264fe2556b29e44c418846889 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include <bench.hpp> +#include "bench.hpp" template void run_stride_unroll<double>(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp index 137ff67d40408f0ff19ef7ed174adf1b7cbd0194..3210116a9ee7399ad920f9167a2e9fb90ae1d873 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include <bench.hpp> +#include "bench.hpp" template void run_stride_unroll<float>(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp index 29ccec014149592652247e33bf9e864a50f67bb3..24a5dcd389973f2382fbb02531a6ba35808481af 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include <bench.hpp> +#include "bench.hpp" template void 
run_stride_unroll<int32_t>(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp index c153d5eff3974fb63b84121346ea20ff96ae9361..0634700c31e110b2b37b5357d47bf70105b1e554 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include <bench.hpp> +#include "bench.hpp" template void run_stride_unroll<int64_t>(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp index b63d486fc9e40a9b327be6316cf0e44387adee64..80f017fbe8fc9d2b237f2339b5db140bcaa021a4 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp @@ -15,28 +15,28 @@ //@HEADER #define UNROLL 1 -#include <bench_unroll_stride.hpp> +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 2 -#include <bench_unroll_stride.hpp> +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 3 -#include <bench_unroll_stride.hpp> +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 4 -#include <bench_unroll_stride.hpp> +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 5 -#include <bench_unroll_stride.hpp> +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 6 -#include <bench_unroll_stride.hpp> +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 7 -#include <bench_unroll_stride.hpp> +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 8 -#include <bench_unroll_stride.hpp> +#include "bench_unroll_stride.hpp" #undef UNROLL template <class Scalar> diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp 
b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 0f7a298c1bb66758e991727b1f021247a7b133ec..762cc988f14ead6151e2f4afa1bb9eca5912d5f8 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -17,16 +17,16 @@ template <class Scalar> struct Run<Scalar, UNROLL, STRIDE> { static void run(int N, int K, int R, int F, int T, int S, int Ba, int I) { - Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> A("A", N, K); - Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> B("B", N, K); - Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> C("C", N, K); + Kokkos::View<Scalar** [STRIDE], Kokkos::LayoutRight> A("A", N, K); + Kokkos::View<Scalar** [STRIDE], Kokkos::LayoutRight> B("B", N, K); + Kokkos::View<Scalar** [STRIDE], Kokkos::LayoutRight> C("C", N, K); Kokkos::deep_copy(A, Scalar(1.5)); Kokkos::deep_copy(B, Scalar(2.5)); Kokkos::deep_copy(C, Scalar(3.5)); Kokkos::Timer timer; - for (int i = 0; i < I; ++i) { + for (int iter = 0; iter < I; ++iter) { Kokkos::parallel_for( "BenchmarkKernel", Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)), diff --git a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp index 20077757d1ffa4557bf404ee8be4976edcf79cf8..fdfcc4ea64ff4f7755a332c0a002848e7917526d 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -16,7 +16,7 @@ #include <Kokkos_Core.hpp> #include <Kokkos_Timer.hpp> -#include <bench.hpp> +#include "bench.hpp" #include <cstdlib> extern template void run_stride_unroll<float>(int, int, int, int, int, int, int, @@ -86,7 +86,7 @@ int main(int argc, char* argv[]) { printf("D must be one of 1,2,4,8,16,32\n"); return 0; } - if ((P < 1) && (P > 2)) { + if ((P < 1) || (P > 4)) { printf("P must be one of 1,2,3,4\n"); return 0; } diff --git 
a/packages/kokkos/benchmarks/gather/CMakeLists.txt b/packages/kokkos/benchmarks/gather/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2de1ce85e6378d97469979a09ee5aafc330049cd --- /dev/null +++ b/packages/kokkos/benchmarks/gather/CMakeLists.txt @@ -0,0 +1 @@ +kokkos_add_executable(gather SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/gather/gather.hpp b/packages/kokkos/benchmarks/gather/gather.hpp index d83461702c7863130803be76276a3d15abab571d..90b1101c1d5e51df4bfe9f76ea43933ad8b36bf6 100644 --- a/packages/kokkos/benchmarks/gather/gather.hpp +++ b/packages/kokkos/benchmarks/gather/gather.hpp @@ -20,28 +20,28 @@ struct RunGather { }; #define UNROLL 1 -#include <gather_unroll.hpp> +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 2 -#include <gather_unroll.hpp> +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 3 -#include <gather_unroll.hpp> +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 4 -#include <gather_unroll.hpp> +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 5 -#include <gather_unroll.hpp> +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 6 -#include <gather_unroll.hpp> +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 7 -#include <gather_unroll.hpp> +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 8 -#include <gather_unroll.hpp> +#include "gather_unroll.hpp" #undef UNROLL template <class Scalar> diff --git a/packages/kokkos/benchmarks/gather/gather_unroll.hpp b/packages/kokkos/benchmarks/gather/gather_unroll.hpp index 5ee5742a3f72f7baa76936ace5e863275700c104..1aa73091bc5ab1792ac367ba1c0a2abddff30002 100644 --- a/packages/kokkos/benchmarks/gather/gather_unroll.hpp +++ b/packages/kokkos/benchmarks/gather/gather_unroll.hpp @@ -138,7 +138,7 @@ struct RunGather<Scalar, UNROLL> { printf( "SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: " "%lf GGather/s: %lf\n", - sizeof(Scalar) / 4, N, K, D, R, UNROLL, F, seconds, + 
static_cast<int>(sizeof(Scalar) / 4), N, K, D, R, UNROLL, F, seconds, 1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds, 1.e-9 * gather_ops / seconds); } diff --git a/packages/kokkos/benchmarks/gather/main.cpp b/packages/kokkos/benchmarks/gather/main.cpp index 7f4fc9ede6ce1c1c7b2dced824a19b984a4bc421..07fca9fdc64dfa905761d474d8f2e5f57d373966 100644 --- a/packages/kokkos/benchmarks/gather/main.cpp +++ b/packages/kokkos/benchmarks/gather/main.cpp @@ -16,7 +16,7 @@ #include <Kokkos_Core.hpp> #include <Kokkos_Timer.hpp> -#include <gather.hpp> +#include "gather.hpp" #include <cstdlib> int main(int argc, char* argv[]) { diff --git a/packages/kokkos/benchmarks/gups/CMakeLists.txt b/packages/kokkos/benchmarks/gups/CMakeLists.txt index 8de5b73cc67f9b3fcd0ff9d9ae1d49efdc6d789a..dc7074702925f83be063f43db652f3cd64096942 100644 --- a/packages/kokkos/benchmarks/gups/CMakeLists.txt +++ b/packages/kokkos/benchmarks/gups/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gups - SOURCES gups.cpp -) +kokkos_add_executable(gups SOURCES gups.cpp) diff --git a/packages/kokkos/benchmarks/gups/gups.cpp b/packages/kokkos/benchmarks/gups/gups.cpp index 369052321d7b0c44099e4707006b85c7dbcc1be7..e00f87968bde036b18dd86e7497f0426d3d1006e 100644 --- a/packages/kokkos/benchmarks/gups/gups.cpp +++ b/packages/kokkos/benchmarks/gups/gups.cpp @@ -140,7 +140,7 @@ int run_benchmark(const Index indicesCount, const Index dataCount, break; } default: { - throw std::runtime_error("unexpected mode"); + Kokkos::abort("unexpected mode"); } } diff --git a/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt b/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4775bf2261e0a9166d50dbf705491806400f2422 --- /dev/null +++ b/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt @@ -0,0 +1 @@ +kokkos_add_executable(launch_latency SOURCES launch_latency.cpp) diff --git 
a/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp b/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp new file mode 100644 index 0000000000000000000000000000000000000000..156c29af09e6bcd113dc4155e9fcceb363c9c6c1 --- /dev/null +++ b/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp @@ -0,0 +1,283 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! \file launch_latency.cpp + + Tests of parallel_for and parallel_reduce latency for different + circumstances. + + Three launch kinds are tested: parallel_for, parallel_reduce into scalar, + and parallel_reduce into view + + N controls how large the parallel loops is + V controls how large the functor is + M controls across how many launches the latency is averaged + K controls how larege the nested loop is (no larger than V) + + For each launch kind, + 1. Avg functor dispatch latency: (time to do M launches) / M + 2. Avg functor completion throughput: (M launches + sync) / M + 3. 
Avg functor completion latency: (M (launch + sync)) / M +*/ + +#include <Kokkos_Core.hpp> + +template <int V> +struct TestFunctor { + double values[V]; + Kokkos::View<double*> a; + int K; + TestFunctor(Kokkos::View<double*> a_, int K_) : a(a_), K(K_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j]; + } +}; + +template <int V> +struct TestRFunctor { + double values[V]; + Kokkos::View<double*> a; + int K; + TestRFunctor(Kokkos::View<double*> a_, int K_) : a(a_), K(K_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, double& lsum) const { + for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j]; + lsum += a(i); + } +}; + +struct Opts { + bool par_for = true; + bool par_reduce = true; + bool par_reduce_view = true; +}; + +template <int V> +void run(int N, int M, int K, const Opts& opts) { + std::string l_no_fence, l_fence, l_red_no_fence, l_red_fence, + l_red_view_no_fence, l_red_view_fence; + { + std::ostringstream ostream; + ostream << "RunNoFence_" << N << "_" << K << std::endl; + l_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunFence_" << N << "_" << K << std::endl; + l_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceNoFence_" << N << "_" << K << std::endl; + l_red_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceFence_" << N << "_" << K << std::endl; + l_red_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceViewNoFence_" << N << "_" << K << std::endl; + l_red_view_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceViewFence_" << N << "_" << K << std::endl; + l_red_view_fence = ostream.str(); + } + + double result; + Kokkos::View<double*> a("A", N); + Kokkos::View<double> v_result("result"); + TestFunctor<V> f(a, K); + TestRFunctor<V> rf(a, K); + Kokkos::Timer timer; + + // initialize to an obviously 
wrong value + double time_no_fence = -1; // launch loop + double time_no_fence_fenced = -1; // launch loop then fence + double time_fence = -1; // launch&fence loop + + double time_red_no_fence = -1; + double time_red_no_fence_fenced = -1; + double time_red_fence = -1; + + double time_red_view_no_fence = -1; + double time_red_view_no_fence_fenced = -1; + double time_red_view_fence = -1; + + if (opts.par_for) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_for(l_no_fence, N, f); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_for(l_no_fence, N, f); + } + time_no_fence = timer.seconds(); + Kokkos::fence(); + time_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_for(l_fence, N, f); + Kokkos::fence(); + } + time_fence = timer.seconds(); + } + + if (opts.par_reduce) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_reduce(l_red_no_fence, N, rf, result); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_no_fence, N, rf, result); + } + time_red_no_fence = timer.seconds(); + Kokkos::fence(); + time_red_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_fence, N, rf, result); + Kokkos::fence(); + } + time_red_fence = timer.seconds(); + Kokkos::fence(); + } + + if (opts.par_reduce_view) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result); + } + time_red_view_no_fence = timer.seconds(); + Kokkos::fence(); + time_red_view_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_view_fence, N, rf, v_result); + Kokkos::fence(); + } + time_red_view_fence = timer.seconds(); + 
Kokkos::fence(); + timer.reset(); + } + + const double x = 1.e6 / M; + printf("%i %i %i %i", N, V, K, M); + if (opts.par_for) { + printf(" parallel_for: %lf %lf ( %lf )", x * time_no_fence, x * time_fence, + x * time_no_fence_fenced); + } + if (opts.par_reduce) { + printf(" parallel_reduce: %lf %lf ( %lf )", x * time_red_no_fence, + x * time_red_fence, x * time_red_no_fence_fenced); + } + if (opts.par_reduce_view) { + printf(" parallel_reduce(view): %lf %lf ( %lf )", + x * time_red_view_no_fence, x * time_red_view_fence, + x * time_red_view_no_fence_fenced); + } + printf("\n"); +} +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + int N = 10000; + int M = 20; + int K = 1; + + Opts opts; + + printf("==========================\n"); + printf("Kokkos Launch Latency Test\n"); + printf("==========================\n"); + printf("\n"); + printf("Usage: %s ARGUMENTS [OPTIONS...]\n\n", argv[0]); + printf("Arguments: N M K\n"); + printf(" N: loop length\n"); + printf(" M: how many kernels to dispatch\n"); + printf( + " K: nested loop length (capped by size of functor member array\n\n"); + printf("Options:\n"); + printf(" --no-parallel-for: skip parallel_for benchmark\n"); + printf(" --no-parallel-reduce: skip parallel_reduce benchmark\n"); + printf( + " --no-parallel-reduce-view: skip parallel_reduce into view " + "benchmark\n"); + printf("\n\n"); + printf(" Output V is the size of the functor member array\n"); + printf("\n\n"); + + for (int i = 1; i < argc; ++i) { + const std::string_view arg(argv[i]); + + // anything that doesn't start with -- + if (arg.size() < 2 || + (arg.size() >= 2 && arg[0] != '-' && arg[1] != '-')) { + if (i == 1) + N = atoi(arg.data()); + else if (i == 2) + M = atoi(arg.data()); + else if (i == 3) + K = atoi(arg.data()); + else { + Kokkos::abort("unexpected argument!"); + } + } else if (arg == "--no-parallel-for") { + opts.par_for = false; + } else if (arg == "--no-parallel-reduce") { + opts.par_reduce = false; + } else if 
(arg == "--no-parallel-reduce-view") { + opts.par_reduce_view = false; + } else { + std::stringstream ss; + ss << "unexpected argument \"" << arg << "\" at position " << i; + Kokkos::abort(ss.str().c_str()); + } + } + + printf("N V K M time_no_fence time_fence (time_no_fence_fenced)\n"); + + /* A backend may have different launch strategies for functors of different + * sizes: test a variety of functor sizes.*/ + run<1>(N, M, K <= 1 ? K : 1, opts); + run<16>(N, M, K <= 16 ? K : 16, opts); + run<200>(N, M, K <= 200 ? K : 200, opts); + run<3000>(N, M, K <= 3000 ? K : 3000, opts); + run<30000>(N, M, K <= 30000 ? K : 30000, opts); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt b/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a939775c0bc2a74ec034f7be05e4ee2cfa2f6d3 --- /dev/null +++ b/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt @@ -0,0 +1 @@ +kokkos_add_executable(policy_performance SOURCES main.cpp) diff --git a/packages/kokkos/benchmarks/policy_performance/main.cpp b/packages/kokkos/benchmarks/policy_performance/main.cpp index 28cfde552a59c987efe83d140de379dd893b2891..0983a3d535c9d066749206cd3dc8b39b5b82b6fe 100644 --- a/packages/kokkos/benchmarks/policy_performance/main.cpp +++ b/packages/kokkos/benchmarks/policy_performance/main.cpp @@ -106,8 +106,9 @@ int main(int argc, char* argv[]) { Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, - double& lval) { lval += 1; }, + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, double& lval) { + lval += 1; + }, result); using view_type_1d = Kokkos::View<double*, Kokkos::LayoutRight>; diff --git a/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp index 
cc2cc40257b86cf0813b8dca6d64733b14ae2ba8..0e23d221f671bd1eedf8da1aa30e429d9e5e403f 100644 --- a/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp +++ b/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp @@ -21,13 +21,13 @@ struct ParallelScanFunctor { using value_type = double; ViewType v; - ParallelScanFunctor(const ViewType& v_) : v(v_) {} + explicit ParallelScanFunctor(const ViewType& v_) : v(v_) {} KOKKOS_INLINE_FUNCTION - void operator()(const int idx, value_type& val, const bool& final) const { + void operator()(const int idx, value_type& val, const bool& is_final) const { // inclusive scan val += v(idx); - if (final) { + if (is_final) { v(idx) = val; } } @@ -109,7 +109,7 @@ void test_policy(int team_range, int thread_range, int vector_range, vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); } v2(idx, t) = vector_result; @@ -128,7 +128,7 @@ void test_policy(int team_range, int thread_range, int vector_range, team_result = 0.0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { lval += 1; }, team_result); + [&](const int, double& lval) { lval += 1; }, team_result); } v1(idx) = team_result; // prevent compiler optimizing loop away @@ -170,13 +170,13 @@ void test_policy(int team_range, int thread_range, int vector_range, for (int tr = 0; tr < thread_repeat; ++tr) { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { + [&](const int, double& lval) { double vector_result = 0.0; for (int vr = 0; vr < inner_repeat; ++vr) { vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); lval += vector_result; } diff --git 
a/packages/kokkos/benchmarks/stream/CMakeLists.txt b/packages/kokkos/benchmarks/stream/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b096976c486f618438fc04799431dc239e10f273 --- /dev/null +++ b/packages/kokkos/benchmarks/stream/CMakeLists.txt @@ -0,0 +1 @@ +kokkos_add_executable(stream SOURCES stream-kokkos.cpp) diff --git a/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt b/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7bbc13b6ec55e261b802fe6867c690ec9e34353 --- /dev/null +++ b/packages/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt @@ -0,0 +1 @@ +kokkos_add_executable(view_copy_constructor SOURCES view_copy_constructor.cpp) diff --git a/packages/kokkos/benchmarks/view_copy_constructor/Makefile b/packages/kokkos/benchmarks/view_copy_constructor/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..70c6d517e0d32a9e8b225f58e015911a4f31c25f --- /dev/null +++ b/packages/kokkos/benchmarks/view_copy_constructor/Makefile @@ -0,0 +1,46 @@ +KOKKOS_DEVICES=Serial +KOKKOS_ARCH = "" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +CXX = clang++ +EXE = view_copy_constructor.exe + +CXXFLAGS ?= -Ofast +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -Ofast +KOKKOS_CXX_STANDARD=c++20 + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o view_copy_constructor.cuda view_copy_constructor.exe + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/benchmarks/view_copy_constructor/view_copy_constructor.cpp b/packages/kokkos/benchmarks/view_copy_constructor/view_copy_constructor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..63c49f09c01e5b137f549b0bfec68276b6457b2b --- /dev/null +++ b/packages/kokkos/benchmarks/view_copy_constructor/view_copy_constructor.cpp @@ -0,0 +1,310 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// The function "test_view_collection" exposes the copy constructor +// and destructor overheads in Kokkos View objects +// Please see the lines marked by "NOTE". 
+ +#include <limits> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <sys/time.h> +#include <Kokkos_Core.hpp> +#include <iostream> + +// NVIEWS is the number of Kokkos View objects in our ViewCollection object +// We have chosen a large value of 40 to make it easier to see performance +// differences when using the likelihood attribute +#define NVIEWS 40 + +class ViewCollection { + public: + Kokkos::View<double*> v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, + v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40; + double m_expected_sum; + double m_side_effect; + int m_N; + + ViewCollection(int N) + : v1("v1", N), + v2("v2", N), + v3("v3", N), + v4("v4", N), + v5("v5", N), + v6("v6", N), + v7("v7", N), + v8("v8", N), + v9("v9", N), + v10("v10", N), + v11("v11", N), + v12("v12", N), + v13("v13", N), + v14("v14", N), + v15("v15", N), + v16("v16", N), + v17("v17", N), + v18("v18", N), + v19("v19", N), + v20("v20", N), + v21("v21", N), + v22("v22", N), + v23("v23", N), + v24("v24", N), + v25("v25", N), + v26("v26", N), + v27("v27", N), + v28("v28", N), + v29("v29", N), + v30("v30", N), + v31("v31", N), + v32("v32", N), + v33("v33", N), + v34("v34", N), + v35("v35", N), + v36("v36", N), + v37("v37", N), + v38("v38", N), + v39("v39", N), + v40("v40", N), + m_expected_sum(N * NVIEWS), + m_side_effect(0.0), + m_N(N) { + for (int i = 0; i < N; ++i) { + v1(i) = 1; + v2(i) = 1; + v3(i) = 1; + v4(i) = 1; + v5(i) = 1; + v6(i) = 1; + v7(i) = 1; + v8(i) = 1; + v9(i) = 1; + v10(i) = 1; + v11(i) = 1; + v12(i) = 1; + v13(i) = 1; + v14(i) = 1; + v15(i) = 1; + v16(i) = 1; + v17(i) = 1; + v18(i) = 1; + v19(i) = 1; + v20(i) = 1; + v21(i) = 1; + v22(i) = 1; + v23(i) = 1; + v24(i) = 1; + v25(i) = 1; + v26(i) = 1; + v27(i) = 1; + v28(i) = 1; + v29(i) = 1; + v30(i) = 1; + v31(i) = 1; + v32(i) = 1; + v33(i) = 1; + v34(i) = 1; + v35(i) = 1; + v36(i) = 1; + v37(i) = 1; + v38(i) = 
1; + v39(i) = 1; + v40(i) = 1; + } + } + +// The ADD_COPY_CONSTRUCTOR macro is helpful to compare time in the copy +// constructor between compilers. We have found that the GNU compiler +// is sometimes able to inline the default copy constructor. +#ifdef ADD_COPY_CONSTRUCTOR + __attribute__((noinline)) ViewCollection(const ViewCollection& other) + : v1(other.v1), + v2(other.v2), + v3(other.v3), + v4(other.v4), + v5(other.v5), + v6(other.v6), + v7(other.v7), + v8(other.v8), + v9(other.v9), + v10(other.v10), + v11(other.v11), + v12(other.v12), + v13(other.v13), + v14(other.v14), + v15(other.v15), + v16(other.v16), + v17(other.v17), + v18(other.v18), + v19(other.v19), + v20(other.v20), + v21(other.v21), + v22(other.v22), + v23(other.v23), + v24(other.v24), + v25(other.v25), + v26(other.v26), + v27(other.v27), + v28(other.v28), + v29(other.v29), + v30(other.v30), + v31(other.v31), + v32(other.v32), + v33(other.v33), + v34(other.v34), + v35(other.v35), + v36(other.v36), + v37(other.v37), + v38(other.v38), + v39(other.v39), + v40(other.v40), + m_expected_sum(other.m_expected_sum), + m_side_effect(other.m_side_effect), + m_N(other.m_N) {} +#endif + + KOKKOS_INLINE_FUNCTION + double sum_views(int ii, bool execute_kernel) { + double result = 0.0; + if (execute_kernel) { + // This code is only executed when using the command line option -k + // The computation references all Kokkos views. 
This may help our + // effort to stop compilers from optimizing away the Kokkos views + for (int i = 0; i < m_N; ++i) { + result += v1(i) + v2(i) + v3(i) + v4(i) + v5(i) + v6(i) + v7(i) + + v8(i) + v9(i) + v10(i) + v11(i) + v12(i) + v13(i) + v14(i) + + v15(i) + v16(i) + v17(i) + v18(i) + v19(i) + v20(i) + v21(i) + + v22(i) + v23(i) + v24(i) + v25(i) + v26(i) + v27(i) + v28(i) + + v29(i) + v30(i) + v31(i) + v32(i) + v33(i) + v34(i) + v35(i) + + v36(i) + v37(i) + v38(i) + v39(i) + v40(i); + } + } else { + result = m_expected_sum; + } + // This statement introduces a side effect that may help our effort to + // stop compilers from optimizing away the temporary ViewCollection object + m_side_effect = result * (ii + 1); + return result; + } +}; + +void test_view_collection_kk(int N, int num_iter, bool execute_kernel) { + ViewCollection view_collection(N); + + Kokkos::Timer view_collection_timer; + double max_value = 0.0; + // Max Reduction boilerplate code taken from slide 53 of + // kokkos-tutorials/LectureSeries/KokkosTutorial_02_ViewsAndSpaces.pdf + Kokkos::parallel_reduce( + "collection-reduction", num_iter, + KOKKOS_LAMBDA(int i, double& valueToUpdate) { + // NOTE: The following lines expose the Kokkos View overheads + ViewCollection tmp_view_collection = view_collection; + double my_value = tmp_view_collection.sum_views(i, execute_kernel); + if (my_value > valueToUpdate) valueToUpdate = my_value; + }, + Kokkos::Max<double>(max_value)); + double view_collection_time = view_collection_timer.seconds(); + + bool success = std::fabs(max_value - N * NVIEWS) < 1.E-6; + std::cout << "View Time = " << view_collection_time << " seconds" + << std::endl; + if (success) { + std::cout << "Kokkos run:" << std::endl; + std::cout << "SUCCESS" << std::endl; + } else { + std::cout << "FAILURE" << std::endl; + } +} + +void test_view_collection_serial(int N, int num_iter, bool execute_kernel) { + ViewCollection view_collection(N); + + Kokkos::Timer view_collection_timer; + double 
max_value = 0.0; + // Max Reduction boilerplate code taken from slide 53 of + // kokkos-tutorials/LectureSeries/KokkosTutorial_02_ViewsAndSpaces.pdf + for (int i = 0; i < num_iter; ++i) { + // NOTE: The following lines expose the Kokkos View overheads + ViewCollection tmp_view_collection = view_collection; + double my_value = tmp_view_collection.sum_views(i, execute_kernel); + if (my_value > max_value) max_value = my_value; + } + double view_collection_time = view_collection_timer.seconds(); + + bool success = std::fabs(max_value - N * NVIEWS) < 1.E-6; + std::cout << "View Time 2 = " << view_collection_time << " seconds" + << std::endl; + if (success) { + std::cout << "Serial run:" << std::endl; + std::cout << "SUCCESS" << std::endl; + } else { + std::cout << "FAILURE" << std::endl; + } +} + +int main(int argc, char* argv[]) { + // The benchmark is only testing reference counting for views on host. +#if defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_SERIAL) || \ + defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_HPX) + int N = 1; + int num_iter = 1 << 27; + bool execute_kernel = false; + + for (int i = 0; i < argc; i++) { + if ((strcmp(argv[i], "-N") == 0)) { + N = atoi(argv[++i]); + if (N < 1) { + std::cout << "Array extent must be >= 1" << std::endl; + exit(1); + } + } else if (strcmp(argv[i], "-i") == 0) { + num_iter = atoi(argv[++i]); + if (num_iter < 1) { + std::cout << "Number of iterations must be >= 1" << std::endl; + exit(1); + } + } else if (strcmp(argv[i], "-k") == 0) { + execute_kernel = true; + } else if ((strcmp(argv[i], "-h") == 0)) { + printf(" Options:\n"); + printf(" -N <int>: Array extent\n"); + printf(" -i <int>: Number of iterations\n"); + printf(" -k: Execute the summation kernel\n"); + printf(" -h: Print this message\n\n"); + exit(1); + } + } + + std::cout << "Array extent = " << N << std::endl; + std::cout << "Iterations = " << num_iter << std::endl; + std::cout << "Execute summation kernel = " << std::boolalpha << 
execute_kernel + << std::noboolalpha << std::endl; + + // Test inside a Kokkos kernel. + Kokkos::initialize(argc, argv); + { test_view_collection_kk(N, num_iter, execute_kernel); } + + // Test outside Kokkos kernel. + test_view_collection_serial(N, num_iter, execute_kernel); + + Kokkos::finalize(); +#endif + + return 0; +} diff --git a/packages/kokkos/bin/kokkos_launch_compiler b/packages/kokkos/bin/kokkos_launch_compiler index d1f8896f91b1d5b4b5210c21c26ca28cbcd45240..ee3c29e96d3ab43b3eeda6f9d9db79739c10f164 100755 --- a/packages/kokkos/bin/kokkos_launch_compiler +++ b/packages/kokkos/bin/kokkos_launch_compiler @@ -62,7 +62,7 @@ KOKKOS_COMPILER=${1} shift # store the expected C++ compiler -CXX_COMPILER=${1} +CXX_COMPILER=$(which "${1}") # remove the expected C++ compiler from the arguments shift @@ -84,7 +84,7 @@ shift # kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # results in this command being executed: # ${KOKKOS_COMPILER} -c file.cpp -o file.o -if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != $(which "${1}") ]]; then debug-message "$@" # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} exec "$@" diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper index c1400872402bae59fd97a698d9f8ed243a74372a..d58645f98ad6d292fbb6a2de62cc755eac5aed09 100755 --- a/packages/kokkos/bin/nvcc_wrapper +++ b/packages/kokkos/bin/nvcc_wrapper @@ -229,11 +229,11 @@ do fi ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-G|-lineinfo|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + 
--dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|--fmad=*|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --extended-lambda|--expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler) + --extended-lambda|--expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler|--disable-warnings) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument diff --git a/packages/kokkos/cmake/Dependencies.cmake b/packages/kokkos/cmake/Dependencies.cmake index 611c089b2e3feec2ec79228360f93c242fc055e2..2f70c2f038c1fc12bb8c2bc6fe15e1e54ca676f4 100644 --- a/packages/kokkos/cmake/Dependencies.cmake +++ b/packages/kokkos/cmake/Dependencies.cmake @@ -1,6 +1,3 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - TEST_OPTIONAL_TPLS CUSPARSE - ) +tribits_package_define_dependencies(LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib) -TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) +tribits_tpl_tentatively_enable(DLlib) diff --git a/packages/kokkos/cmake/KokkosConfig.cmake.in b/packages/kokkos/cmake/KokkosConfig.cmake.in index e26c75b31224889d6d5806f8624d97933b6acf8e..1b6d1b66ff5d4035da8b7a014aac8ae66589cfe6 100644 --- a/packages/kokkos/cmake/KokkosConfig.cmake.in +++ b/packages/kokkos/cmake/KokkosConfig.cmake.in @@ -39,10 +39,12 @@ IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) GLOBAL CHECK_CUDA_COMPILES) -ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +ELSEIF(@Kokkos_ENABLE_CUDA@ + AND NOT @KOKKOS_COMPILE_LANGUAGE@ STREQUAL CUDA + AND NOT 
"separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) # - # if CUDA was enabled, separable compilation was not specified, and current compiler - # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # if CUDA was enabled, the compilation language was not set to CUDA, and separable compilation was not + # specified, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, # otherwise, the original command will be executed diff --git a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in index 8d5ef0de42f9440070e5772d31fbf9324be7e7a3..d3ac39ffa31a1114c1649b7ec67844b2b221158b 100644 --- a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in +++ b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in @@ -225,8 +225,13 @@ FUNCTION(kokkos_compilation) # if built w/o CUDA support, we want to basically make this a no-op SET(_Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) + + IF(CMAKE_VERSION VERSION_GREATER_EQUAL 3.17) + SET(MAYBE_CURRENT_INSTALLATION_ROOT "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../../..") + ENDIF() + # search relative first and then absolute - SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." 
"@CMAKE_INSTALL_PREFIX@") + SET(_HINTS "${MAYBE_CURRENT_INSTALLATION_ROOT}" "@CMAKE_INSTALL_PREFIX@") # find kokkos_launch_compiler FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in index bec59ebd034939d3f819f2990a5c1494c517772b..44f81bb8cea2cac47cfd82dfdb9b6a4ec8253102 100644 --- a/packages/kokkos/cmake/KokkosCore_config.h.in +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -23,29 +23,31 @@ #cmakedefine KOKKOS_ENABLE_CUDA #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX -#cmakedefine KOKKOS_ENABLE_MEMKIND -#cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL -#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED /* General Settings */ #cmakedefine KOKKOS_ENABLE_CXX17 #cmakedefine KOKKOS_ENABLE_CXX20 #cmakedefine KOKKOS_ENABLE_CXX23 +#cmakedefine KOKKOS_ENABLE_CXX26 #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS +#cmakedefine KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE +#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED +#cmakedefine KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK #cmakedefine KOKKOS_ENABLE_TUNING -#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_4 #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS @@ -53,21 +55,22 @@ #cmakedefine 
KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN +#cmakedefine KOKKOS_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY +#cmakedefine KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND +#cmakedefine KOKKOS_ENABLE_ATOMICS_BYPASS /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC -#cmakedefine KOKKOS_USE_LIBRT -#cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH -#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND #cmakedefine KOKKOS_ENABLE_ONEDPL +#cmakedefine KOKKOS_ENABLE_ROCTHRUST -#cmakedefine KOKKOS_ARCH_SSE42 #cmakedefine KOKKOS_ARCH_ARMV80 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2 +#cmakedefine KOKKOS_ARCH_ARMV9_GRACE #cmakedefine KOKKOS_ARCH_A64FX #cmakedefine KOKKOS_ARCH_AVX #cmakedefine KOKKOS_ARCH_AVX2 @@ -78,6 +81,8 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 +#cmakedefine KOKKOS_ARCH_RISCV_SG2042 +#cmakedefine KOKKOS_ARCH_RISCV_RVA22V #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 @@ -114,10 +119,13 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX906 #cmakedefine KOKKOS_ARCH_AMD_GFX908 #cmakedefine KOKKOS_ARCH_AMD_GFX90A +#cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 +#cmakedefine KOKKOS_ARCH_AMD_GFX942_APU #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 -#cmakedefine KOKKOS_ARCH_AMD_GPU +#cmakedefine KOKKOS_ARCH_AMD_GFX1103 +#cmakedefine KOKKOS_ARCH_AMD_GPU "@KOKKOS_ARCH_AMD_GPU@" #cmakedefine KOKKOS_ARCH_VEGA // deprecated #cmakedefine KOKKOS_ARCH_VEGA906 // deprecated #cmakedefine KOKKOS_ARCH_VEGA908 // deprecated diff --git a/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in b/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in deleted file mode 100644 index 
626ef5a8ebefcaf7adcdeaa3b285f44892527dbc..0000000000000000000000000000000000000000 --- a/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in +++ /dev/null @@ -1,17 +0,0 @@ -IF (NOT TARGET Kokkos::kokkos) - # Compute the installation prefix relative to this file. - get_filename_component(KOKKOS_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - if(KOKKOS_IMPORT_PREFIX STREQUAL "/") - set(KOKKOS_IMPORT_PREFIX "") - endif() - add_library(Kokkos::kokkos INTERFACE IMPORTED) - set_target_properties(Kokkos::kokkos PROPERTIES - INTERFACE_LINK_LIBRARIES "@Kokkos_LIBRARIES@;@KOKKOS_LINK_OPTIONS@" - INTERFACE_COMPILE_FEATURES "@KOKKOS_CXX_STANDARD_FEATURE@" - INTERFACE_COMPILE_OPTIONS "@KOKKOS_ALL_COMPILE_OPTIONS@" - INTERFACE_INCLUDE_DIRECTORIES "${KOKKOS_IMPORT_PREFIX}/include" - ) -ENDIF() diff --git a/packages/kokkos/cmake/Modules/CudaToolkit.cmake b/packages/kokkos/cmake/Modules/CudaToolkit.cmake index eda5541f7c0633a868285190e9a4c39c275adf6b..b8ac2048b5fce822778e6e6606ba65aca7b0639d 100644 --- a/packages/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/packages/kokkos/cmake/Modules/CudaToolkit.cmake @@ -483,38 +483,40 @@ endif() # Try language- or user-provided path first. if(CUDAToolkit_BIN_DIR) - find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${CUDAToolkit_BIN_DIR} NO_DEFAULT_PATH - ) + ) endif() # Search using CUDAToolkit_ROOT -find_program(CUDAToolkit_NVCC_EXECUTABLE +find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ENV CUDA_PATH PATH_SUFFIXES bin ) # If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. 
-if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) +if(NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) # Declare error messages now, print later depending on find_package args. set(fail_base "Could not find nvcc executable in path specified by") set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") - if (CUDAToolkit_FIND_REQUIRED) - if (DEFINED CUDAToolkit_ROOT) + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) message(FATAL_ERROR ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(FATAL_ERROR ${env_cuda_root_fail}) endif() else() - if (NOT CUDAToolkit_FIND_QUIETLY) - if (DEFINED CUDAToolkit_ROOT) + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) message(STATUS ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(STATUS ${env_cuda_root_fail}) endif() endif() @@ -535,9 +537,9 @@ endif() # We will also search the default symlink location /usr/local/cuda first since # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked # directory is the desired location. -if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (UNIX) - if (NOT APPLE) +if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(UNIX) + if(NOT APPLE) set(platform_base "/usr/local/cuda-") else() set(platform_base "/Developer/NVIDIA/CUDA-") @@ -550,10 +552,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) file(GLOB possible_paths "${platform_base}*") # Iterate the glob results and create a descending list. 
set(possible_versions) - foreach (p ${possible_paths}) + foreach(p ${possible_paths}) # Extract version number from end of string string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) - if (IS_DIRECTORY ${p} AND p_version) + if(IS_DIRECTORY ${p} AND p_version) list(APPEND possible_versions ${p_version}) endif() endforeach() @@ -563,10 +565,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # every possible version of CUDA installed, this wouldn't create any # significant overhead. set(versions) - foreach (v ${possible_versions}) + foreach(v ${possible_versions}) list(LENGTH versions num_versions) # First version, nothing to compare with so just append. - if (num_versions EQUAL 0) + if(num_versions EQUAL 0) list(APPEND versions ${v}) else() # Loop through list. Insert at an index when comparison is @@ -574,9 +576,9 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # happen since this came from a glob list of directories. set(i 0) set(early_terminate FALSE) - while (i LESS num_versions) + while(i LESS num_versions) list(GET versions ${i} curr) - if (v VERSION_GREATER curr) + if(v VERSION_GREATER curr) list(INSERT versions ${i} ${v}) set(early_terminate TRUE) break() @@ -584,7 +586,7 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) math(EXPR i "${i} + 1") endwhile() # If it did not get inserted, place it at the end. - if (NOT early_terminate) + if(NOT early_terminate) list(APPEND versions ${v}) endif() endif() @@ -592,17 +594,18 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # With a descending list of versions, populate possible paths to search. set(search_paths) - foreach (v ${versions}) + foreach(v ${versions}) list(APPEND search_paths "${platform_base}${v}") endforeach() # Force the global default /usr/local/cuda to the front on Unix. - if (UNIX) + if(UNIX) list(INSERT search_paths 0 "/usr/local/cuda") endif() # Now search for nvcc again using the platform default search paths. 
- find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${search_paths} PATH_SUFFIXES bin @@ -617,8 +620,8 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) unset(early_terminate) unset(search_paths) - if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (CUDAToolkit_FIND_REQUIRED) + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(CUDAToolkit_FIND_REQUIRED) message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") elseif(NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") @@ -636,8 +639,7 @@ if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) unset(cuda_dir) endif() -if(CUDAToolkit_NVCC_EXECUTABLE AND - CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) +if(CUDAToolkit_NVCC_EXECUTABLE AND CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value # This if statement will always match, but is used to provide variables for MATCH 1,2,3... 
if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) @@ -648,39 +650,38 @@ if(CUDAToolkit_NVCC_EXECUTABLE AND endif() else() # Compute the version by invoking nvcc - execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") - set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") endif() unset(NVCC_OUT) endif() - get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) # Handle cross compilation if(CMAKE_CROSSCOMPILING) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") # Support for NVPACK - set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") # Support for arm cross compilation set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") # Support for aarch64 cross compilation - if (ANDROID_ARCH_NAME STREQUAL "arm64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") else() set(CUDAToolkit_TARGET_NAME "aarch64-linux") - endif (ANDROID_ARCH_NAME STREQUAL "arm64") + endif(ANDROID_ARCH_NAME STREQUAL "arm64") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - set(CUDAToolkit_TARGET_NAME "x86_64-linux") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") endif() - if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") # add 
known CUDA target root path to the set of directories we search for programs, libraries and headers list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") @@ -702,25 +703,16 @@ else() set(_CUDAToolkit_Pop_Prefix True) endif() - # Find the include/ directory -find_path(CUDAToolkit_INCLUDE_DIR - NAMES cuda_runtime.h -) +find_path(CUDAToolkit_INCLUDE_DIR NAMES cuda_runtime.h) # And find the CUDA Runtime Library libcudart -find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64 lib/x64 -) -if (NOT CUDA_CUDART) - find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64/stubs lib/x64/stubs - ) +find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64 lib/x64) +if(NOT CUDA_CUDART) + find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64/stubs lib/x64/stubs) endif() -if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() @@ -733,24 +725,17 @@ endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. 
include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(CUDAToolkit - REQUIRED_VARS - CUDAToolkit_INCLUDE_DIR - CUDA_CUDART - CUDAToolkit_NVCC_EXECUTABLE - VERSION_VAR - CUDAToolkit_VERSION +find_package_handle_standard_args( + CUDAToolkit REQUIRED_VARS CUDAToolkit_INCLUDE_DIR CUDA_CUDART CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR CUDAToolkit_VERSION ) -mark_as_advanced(CUDA_CUDART - CUDAToolkit_INCLUDE_DIR - CUDAToolkit_NVCC_EXECUTABLE - ) +mark_as_advanced(CUDA_CUDART CUDAToolkit_INCLUDE_DIR CUDAToolkit_NVCC_EXECUTABLE) #----------------------------------------------------------------------------- # Construct result variables if(CUDAToolkit_FOUND) - set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) - get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) endif() #----------------------------------------------------------------------------- @@ -762,27 +747,26 @@ if(CUDAToolkit_FOUND) set(search_names ${lib_name} ${arg_ALT}) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib - ${arg_EXTRA_PATH_SUFFIXES} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib ${arg_EXTRA_PATH_SUFFIXES} ) # Don't try any stub directories intil we have exhausted all other # search locations. 
if(NOT CUDA_${lib_name}_LIBRARY) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs ) endif() mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if(NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") @@ -800,16 +784,15 @@ if(CUDAToolkit_FOUND) target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() - _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _cudatoolkit_find_and_add_import_lib(cuda_driver ALT cuda) - _CUDAToolkit_find_and_add_import_lib(cudart) - _CUDAToolkit_find_and_add_import_lib(cudart_static) + _cudatoolkit_find_and_add_import_lib(cudart) + _cudatoolkit_find_and_add_import_lib(cudart_static) # setup dependencies that are required for cudart_static when building # on linux. 
These are generally only required when using the CUDA toolkit # when CUDA language is disabled - if(NOT TARGET CUDA::cudart_static_deps - AND TARGET CUDA::cudart_static) + if(NOT TARGET CUDA::cudart_static_deps AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) @@ -831,55 +814,64 @@ if(CUDAToolkit_FOUND) endif() endif() - _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library - foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + _cudatoolkit_find_and_add_import_lib(culibos) # it's a static library + foreach(cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() # cuFFTW depends on cuFFT - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft_static) # cuSOLVER depends on cuBLAS, and cuSPARSE - _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) - _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) + _cudatoolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _cudatoolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. 
- _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) - _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + _cudatoolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _cudatoolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries. - foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + foreach( + cuda_lib + nppial + nppicc + nppidei + nppif + nppig + nppim + nppist + nppitc + npps + nppicom + nppisu + ) + _cudatoolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - _CUDAToolkit_find_and_add_import_lib(cupti - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(cupti_static - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti_static EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + _cudatoolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _cudatoolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory # so prefer the NVTOOLSEXT_PATH windows only environment variable # In addition on windows the most common name is nvToolsExt64_1 - find_library(CUDA_nvToolsExt_LIBRARY + find_library( + CUDA_nvToolsExt_LIBRARY NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt - PATHS ENV NVTOOLSEXT_PATH - ENV CUDA_PATH + PATHS ENV NVTOOLSEXT_PATH ENV CUDA_PATH 
PATH_SUFFIXES lib/x64 lib ) endif() - _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + _cudatoolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - _CUDAToolkit_find_and_add_import_lib(OpenCL) + _cudatoolkit_find_and_add_import_lib(OpenCL) endif() if(_CUDAToolkit_Pop_ROOT_PATH) diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake index 792c92c07e9d2e121ac1504a658057fcd685d109..3a6a826197ec784242b6758082a066748ca24017 100644 --- a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -1,42 +1,40 @@ -IF (NOT CUDAToolkit_ROOT) - IF (NOT CUDA_ROOT) - SET(CUDA_ROOT $ENV{CUDA_ROOT}) - ENDIF() - IF(CUDA_ROOT) - SET(CUDAToolkit_ROOT ${CUDA_ROOT}) - ENDIF() -ENDIF() +if(NOT CUDAToolkit_ROOT) + if(NOT CUDA_ROOT) + set(CUDA_ROOT $ENV{CUDA_ROOT}) + endif() + if(CUDA_ROOT) + set(CUDAToolkit_ROOT ${CUDA_ROOT}) + endif() +endif() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") - find_package(CUDAToolkit) -ELSE() - include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") + message(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") +endif() +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") + find_package(CUDAToolkit REQUIRED) + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + kokkos_export_cmake_tpl(CUDAToolkit REQUIRED) +else() + include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) -IF (TARGET CUDA::cudart) - SET(FOUND_CUDART TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) -ELSE() - SET(FOUND_CUDART FALSE) -ENDIF() + if(TARGET CUDA::cudart) + set(FOUND_CUDART TRUE) + kokkos_export_imported_tpl(CUDA::cudart) + else() + set(FOUND_CUDART FALSE) + endif() -IF (TARGET CUDA::cuda_driver) - SET(FOUND_CUDA_DRIVER TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) -ELSE() - SET(FOUND_CUDA_DRIVER 
FALSE) -ENDIF() + if(TARGET CUDA::cuda_driver) + set(FOUND_CUDA_DRIVER TRUE) + kokkos_export_imported_tpl(CUDA::cuda_driver) + else() + set(FOUND_CUDA_DRIVER FALSE) + endif() -include(FindPackageHandleStandardArgs) -IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_CUDA_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") -ELSE() - SET(KOKKOS_CUDA_ERROR DEFAULT_MSG) -ENDIF() -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${KOKKOS_CUDA_ERROR} FOUND_CUDART FOUND_CUDA_DRIVER) -IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) -ENDIF() + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) + if(FOUND_CUDA_DRIVER AND FOUND_CUDART) + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + endif() +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake index d7b54fb9c9ab79d810adbebb84f1a5bf2104c351..e3c199b7c5de48d98421758f0339136f712660aa 100644 --- a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake @@ -1,15 +1,10 @@ - -FIND_PACKAGE(HPX REQUIRED 1.8.0) +find_package(HPX REQUIRED 1.8.0) #as of right now, HPX doesn't export correctly #so let's convert it to an interface target -KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE - LINK_LIBRARIES ${HPX_LIBRARIES} - INCLUDES ${HPX_INCLUDE_DIRS} -) +kokkos_create_imported_tpl(HPX INTERFACE LINK_LIBRARIES ${HPX_LIBRARIES} INCLUDES ${HPX_INCLUDE_DIRS}) #this is a bit funky since this is a CMake target #but HPX doesn't export itself correctly -KOKKOS_EXPORT_CMAKE_TPL(HPX) +kokkos_export_cmake_tpl(HPX) #I would prefer all of this gets replaced with #KOKKOS_IMPORT_CMAKE_TPL(HPX) - diff --git a/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake b/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake index 
cf763b7e5bb585ed77e8dc1fb3b015566a0326f9..77ce8c71f730568e3c605f52c25f753d41abfe49 100644 --- a/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(HWLOC HEADER hwloc.h LIBRARY hwloc) +kokkos_find_imported(HWLOC HEADER hwloc.h LIBRARY hwloc) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake index 8adcdcdbb8e3f02d3959c8605f1470f57425152c..85ae0b82244ca042447ef844261d5683d2791481 100644 --- a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) +kokkos_find_imported(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake index 70e0d6c454ad86bf9262495f9658253d79aa42ef..ce428b0aeec65f0d40f8df5464acc9ce09cf587c 100644 --- a/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake @@ -2,17 +2,19 @@ # (which would not be contained in CMake's search paths anyway). # Hence, try if the compiler supports libquadmath natively first before doing # the standard package search. 
-SET(CMAKE_REQUIRED_LIBRARIES "quadmath") -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +set(CMAKE_REQUIRED_LIBRARIES "quadmath") +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include <quadmath.h> int main(void){ __float128 foo = ::sqrtq(123.456); return foo; }" - KOKKOS_QUADMATH_COMPILER_SUPPORT) -IF (KOKKOS_QUADMATH_COMPILER_SUPPORT) - KOKKOS_CREATE_IMPORTED_TPL(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) -ELSE() - KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) -ENDIF() + KOKKOS_QUADMATH_COMPILER_SUPPORT +) +if(KOKKOS_QUADMATH_COMPILER_SUPPORT) + kokkos_create_imported_tpl(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) +else() + kokkos_find_imported(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake deleted file mode 100644 index e75da56b5b5324050236ee0ee4c6847452d5b3cf..0000000000000000000000000000000000000000 --- a/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(LIBRT HEADER time.h LIBRARY rt) diff --git a/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake b/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake deleted file mode 100644 index 20aaff22955ce9ad026c51b870bf04b7d8b0df42..0000000000000000000000000000000000000000 --- a/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(MEMKIND HEADER memkind.h LIBRARY memkind) diff --git a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake index 01791cff443c83d8f3d4887fc664ebce8780fbd8..68de942a69835544951df0f7dca64c950873fe5f 100644 --- a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake @@ -1,9 +1,10 @@ -INCLUDE(CheckIncludeFileCXX) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) 
-CHECK_INCLUDE_FILE_CXX(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) +include(CheckIncludeFileCXX) +check_include_file_cxx(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) +check_include_file_cxx(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include <iostream> int main() @@ -13,34 +14,40 @@ CHECK_CXX_SOURCE_COMPILES(" #endif return 0; }" - KOKKOS_NO_TBB_CONFLICT) + KOKKOS_NO_TBB_CONFLICT +) -IF (KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE +if(KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() -ELSE() - FIND_PACKAGE(oneDPL REQUIRED) + endif() +else() + find_package(oneDPL REQUIRED) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE LINK_LIBRARIES oneDPL) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE + LINK_LIBRARIES + oneDPL # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 
_GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() -ENDIF() + endif() + + # Export oneDPL as a Kokkos dependency + kokkos_export_cmake_tpl(oneDPL) +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake index f796737f5b29cdbffd4eadf2b23c7321d7a607fb..9673af0b9d901dbc7d3304b942a1ee6d91e9c4f9 100644 --- a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -1,7 +1,7 @@ include(FindPackageHandleStandardArgs) -FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) -FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) # FIXME_HIP Starting with ROCm 5.5 it is not necessary to link againt clang_rt. 
# We keep the code as is for now because it is hard to find the version of ROCM @@ -16,18 +16,24 @@ execute_process( COMMAND ${CMAKE_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE CLANG_RT_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE CLANG_RT_CHECK) + RESULT_VARIABLE CLANG_RT_CHECK +) -if( NOT "${CLANG_RT_CHECK}" STREQUAL "0" ) +if(NOT "${CLANG_RT_CHECK}" STREQUAL "0") # if the above failed, we delete CLANG_RT_LIBRARY to make the args check # below fail unset(CLANG_RT_LIBRARY) endif() - find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY CLANG_RT_LIBRARY) -kokkos_create_imported_tpl(ROCM INTERFACE - LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} ${CLANG_RT_LIBRARY} - COMPILE_DEFINITIONS __HIP_ROCclr__ +kokkos_create_imported_tpl( + ROCM + INTERFACE + LINK_LIBRARIES + ${HSA_RUNTIME_LIBRARY} + ${AMD_HIP_LIBRARY} + ${CLANG_RT_LIBRARY} + COMPILE_DEFINITIONS + __HIP_ROCclr__ ) diff --git a/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake b/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake new file mode 100644 index 0000000000000000000000000000000000000000..b4b905795dd02da78dd000b3dfe103b5cc2524d6 --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake @@ -0,0 +1,15 @@ +# ROCm 5.6 and earlier set AMDGPU_TARGETS and GPU_TARGETS to all the supported +# architectures. Therefore, we end up compiling Kokkos for all the supported +# architecture. Starting with ROCm 5.7 AMDGPU_TARGETS and GPU_TARGETS are empty. +# It is the user's job to set the variables. Since we are injecting the +# architecture flag ourselves, we can let the variables empty. To replicate the +# behavior of ROCm 5.7 and later for earlier version of ROCm we set +# AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If +# the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. 
+set(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +set(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +find_package(rocthrust REQUIRED) +kokkos_create_imported_tpl(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) + +# Export ROCTHRUST as a Kokkos dependency +kokkos_export_cmake_tpl(rocthrust) diff --git a/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake b/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake index ff0db5123f8e982806d92f62dae9da82413ba6ec..280b8641da150c9fe3ed067f83b56c3289cdd0ca 100644 --- a/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLTHREADS.cmake @@ -1,15 +1,14 @@ -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE(Threads) +include(FindPackageHandleStandardArgs) +find_package(Threads) -IF (TARGET Threads::Threads) - SET(FOUND_THREADS TRUE) -ELSE() - SET(FOUND_THREADS FALSE) -ENDIF() +if(TARGET Threads::Threads) + set(FOUND_THREADS TRUE) +else() + set(FOUND_THREADS FALSE) +endif() -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLTHREADS DEFAULT_MSG FOUND_THREADS) +find_package_handle_standard_args(TPLTHREADS DEFAULT_MSG FOUND_THREADS) #Only create the TPL if we succeed -IF (FOUND_THREADS) - KOKKOS_CREATE_IMPORTED_TPL(THREADS INTERFACE LINK_OPTIONS - ${CMAKE_THREAD_LIBS_INIT}) -ENDIF() +if(FOUND_THREADS) + kokkos_create_imported_tpl(THREADS INTERFACE LINK_OPTIONS ${CMAKE_THREAD_LIBS_INIT}) +endif() diff --git a/packages/kokkos/cmake/README.md b/packages/kokkos/cmake/README.md index 385bbfcd5d5a0a66bcfb893af7953ba555830405..0548e89a90e767beda2d9cf4b260f59d15e1a940 100644 --- a/packages/kokkos/cmake/README.md +++ b/packages/kokkos/cmake/README.md @@ -310,20 +310,6 @@ When Kokkos is loaded by a downstream project, this TPL must be loaded. Calling this function simply appends text recording the location where the TPL was found and adding a `find_dependency(...)` call that will reload the CMake target. 
-### The Great TriBITS Compromise - -TriBITS was a masterpiece of CMake version 2 before the modern CMake idioms of building and using. -TriBITS greatly limited verbosity of CMake files, handled complicated dependency trees between packages, and handled automatically setting up include and linker paths for dependent libraries. - -Kokkos is now used by numerous projects that don't (and won't) depend on TriBITS for their build systems. -Kokkos has to work outside of TriBITS and provide a standard CMake 3+ build system. -At the same time, Kokkos is used by numerous projects that depend on TriBITS and don't (and won't) switch to a standard CMake 3+ build system. - -Instead of calling functions `TRIBITS_X(...)`, the CMake calls wrapper functions `KOKKOS_X(...)`. -If TriBITS is available (as in Trilinos), `KOKKOS_X` will just be a thin wrapper around `TRIBITS_X`. -If TriBITS is not available, Kokkos maps `KOKKOS_X` calls to native CMake that complies with CMake 3 idioms. -For the time being, this seems the most sensible way to handle the competing requirements of a standalone modern CMake and TriBITS build system. 
- ##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) [](https://opensource.org/licenses/BSD-3-Clause) diff --git a/packages/kokkos/cmake/build_env_info.cmake b/packages/kokkos/cmake/build_env_info.cmake index 0eeb6372455bd0e0ff6ece535d2724b3648e936c..76afbb74b63cb48882c9a8cae9c9fd12e0854ed4 100644 --- a/packages/kokkos/cmake/build_env_info.cmake +++ b/packages/kokkos/cmake/build_env_info.cmake @@ -2,121 +2,118 @@ find_package(Git QUIET) -SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) -SET(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -SET(post_configure_dir ${CMAKE_BINARY_DIR}/generated) +set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) +set(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) +set(post_configure_dir ${CMAKE_CURRENT_BINARY_DIR}/generated) -SET(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) -SET(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) +set(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) +set(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) -FUNCTION(check_git_write git_hash git_clean_status) - FILE( - WRITE - ${CMAKE_BINARY_DIR}/git-state.txt - "${git_hash}-${git_clean_status}") -ENDFUNCTION() +function(check_git_write git_hash git_clean_status) + file(WRITE ${CMAKE_BINARY_DIR}/git-state.txt "${git_hash}-${git_clean_status}") +endfunction() -FUNCTION(check_git_read git_hash) - IF(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) - FILE(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) - LIST(GET CONTENT 0 var) +function(check_git_read git_hash) + if(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) + file(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) + list(GET CONTENT 0 var) message(DEBUG "Cached Git hash: ${var}") - SET(${git_hash} ${var} PARENT_SCOPE) + set(${git_hash} ${var} PARENT_SCOPE) else() - SET(${git_hash} "INVALID" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -FUNCTION(check_git_version) - IF(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) - FILE( 
- COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp - DESTINATION ${post_configure_dir}) - ENDIF() - - IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) + set(${git_hash} "INVALID" PARENT_SCOPE) + endif() +endfunction() + +function(check_git_version) + if(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) + file(COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp DESTINATION ${post_configure_dir}) + endif() + + if(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) return() - ENDIF() + endif() # Get the current working branch execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit description execute_process( COMMAND ${GIT_EXECUTABLE} show -s --format=%s WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DESCRIPTION - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit date execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%cI WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DATE - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Check if repo is dirty / clean execute_process( COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} RESULT_VARIABLE IS_DIRTY - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - IF(IS_DIRTY EQUAL 0) - SET(GIT_CLEAN_STATUS "CLEAN") + if(IS_DIRTY EQUAL 0) + set(GIT_CLEAN_STATUS "CLEAN") else() - SET(GIT_CLEAN_STATUS "DIRTY") - ENDIF() + set(GIT_CLEAN_STATUS "DIRTY") + endif() # Get the latest abbreviated commit hash of the working branch execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%h WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE) + 
OUTPUT_STRIP_TRAILING_WHITESPACE + ) check_git_read(GIT_HASH_CACHE) - IF(NOT EXISTS ${post_configure_dir}) + if(NOT EXISTS ${post_configure_dir}) file(MAKE_DIRECTORY ${post_configure_dir}) - ENDIF() + endif() # Only update the git_version.cpp if the hash has changed. This will # prevent us from rebuilding the project more than we need to. - IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} - OR NOT EXISTS ${post_configure_file}) + if(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${post_configure_file}) # Set the GIT_HASH_CACHE variable so the next build won't have # to regenerate the source file. check_git_write(${GIT_COMMIT_HASH} ${GIT_CLEAN_STATUS}) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) message(STATUS "Configured git information in ${post_configure_file}") - ENDIF() -ENDFUNCTION() + endif() +endfunction() -FUNCTION(check_git_setup) +function(check_git_setup) add_custom_target( - AlwaysCheckGit COMMAND ${CMAKE_COMMAND} - -DRUN_CHECK_GIT_VERSION=1 - -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} - -P ${CURRENT_LIST_DIR}/build_env_info.cmake - BYPRODUCTS ${post_configure_file}) + AlwaysCheckGit COMMAND ${CMAKE_COMMAND} -DRUN_CHECK_GIT_VERSION=1 -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} -P + ${CURRENT_LIST_DIR}/build_env_info.cmake BYPRODUCTS ${post_configure_file} + ) - add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) + add_library(impl_git_version ${CMAKE_CURRENT_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) target_compile_features(impl_git_version PRIVATE cxx_raw_string_literals) add_dependencies(impl_git_version AlwaysCheckGit) check_git_version() -ENDFUNCTION() +endfunction() # This is used to run this function from an external cmake process. 
-IF(RUN_CHECK_GIT_VERSION) +if(RUN_CHECK_GIT_VERSION) check_git_version() -ENDIF() +endif() diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp b/packages/kokkos/cmake/compile_tests/amd_apu.cc similarity index 57% rename from packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp rename to packages/kokkos/cmake/compile_tests/amd_apu.cc index 272036396905722ab04982c7526d229da40a79a0..a9c1edbd57b0348f0cab8eb494bb5a1602816444 100644 --- a/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp +++ b/packages/kokkos/cmake/compile_tests/amd_apu.cc @@ -14,5 +14,25 @@ // //@HEADER -#include <TestCuda_Category.hpp> -#include <TestGraph.hpp> +#include <iostream> +#include <hip/hip_runtime_api.h> + +int main() { + hipDeviceProp_t hipProp; + hipError_t error = hipGetDeviceProperties(&hipProp, 0); + + if (error != hipSuccess) { + std::cout << hipGetErrorString(error) << '\n'; + return error; + } + + if (hipProp.integrated == 1) { + // We detected an APU + std::cout << "ON"; + } else { + // We detected a discrete GPU + std::cout << "OFF"; + } + + return 0; +} diff --git a/packages/kokkos/cmake/cray.cmake b/packages/kokkos/cmake/cray.cmake index 08912f5130f92fec97a4bdb6abb90e860d0b9cda..4ce5352bda265763614dbf4597757278a7a71428 100644 --- a/packages/kokkos/cmake/cray.cmake +++ b/packages/kokkos/cmake/cray.cmake @@ -1,9 +1,6 @@ - - function(kokkos_set_cray_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git 
a/packages/kokkos/cmake/deps/CUDA.cmake b/packages/kokkos/cmake/deps/CUDA.cmake index 68bf5b3d5798266c784ee7586751a5ae9d24cd0e..49eaf883a46f699d0a9c83dbbd64e3c5d1365640 100644 --- a/packages/kokkos/cmake/deps/CUDA.cmake +++ b/packages/kokkos/cmake/deps/CUDA.cmake @@ -17,25 +17,24 @@ # Check for CUDA support -SET(_CUDA_FAILURE OFF) +set(_CUDA_FAILURE OFF) # Have CMake find CUDA -IF(NOT _CUDA_FAILURE) - FIND_PACKAGE(CUDA 3.2) - IF (NOT CUDA_FOUND) - SET(_CUDA_FAILURE ON) - ENDIF() -ENDIF() +if(NOT _CUDA_FAILURE) + find_package(CUDA 3.2) + if(NOT CUDA_FOUND) + set(_CUDA_FAILURE ON) + endif() +endif() -IF(NOT _CUDA_FAILURE) +if(NOT _CUDA_FAILURE) # if we haven't met failure macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target) - TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY) + tribits_add_library(${cuda_target} ${ARGN} CUDALIBRARY) endmacro() - GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) -ELSE() - SET(TPL_ENABLE_CUDA OFF) -ENDIF() + global_set(TPL_CUDA_LIBRARY_DIRS) + global_set(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + global_set(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) +else() + set(TPL_ENABLE_CUDA OFF) +endif() diff --git a/packages/kokkos/cmake/deps/CUSPARSE.cmake b/packages/kokkos/cmake/deps/CUSPARSE.cmake deleted file mode 100644 index b016971ab915021a31981d931e53fff9e483ce00..0000000000000000000000000000000000000000 --- a/packages/kokkos/cmake/deps/CUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. 
-# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ************************************************************************ -# @HEADER - -#include(${TRIBITS_DEPS_DIR}/CUDA.cmake) - -#IF (TPL_ENABLE_CUDA) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) -# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) -# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -# KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) -#ENDIF() - diff --git a/packages/kokkos/cmake/deps/HWLOC.cmake b/packages/kokkos/cmake/deps/HWLOC.cmake index 77d5a9b83a649eea2204ae7a3a6be6c4207c27ef..52d8368d0419c1ab335f2bc729f4b17fa7da2ecd 100644 --- a/packages/kokkos/cmake/deps/HWLOC.cmake +++ b/packages/kokkos/cmake/deps/HWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. 
# @@ -26,7 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/packages/kokkos/cmake/deps/Pthread.cmake b/packages/kokkos/cmake/deps/Pthread.cmake index e879bff3741db50ec13b2d71f89d1212015550ff..b811f850841d95b67d463baf0d37f646a0fe7406 100644 --- a/packages/kokkos/cmake/deps/Pthread.cmake +++ b/packages/kokkos/cmake/deps/Pthread.cmake @@ -15,31 +15,27 @@ # ************************************************************************ # @HEADER +set(USE_THREADS FALSE) -SET(USE_THREADS FALSE) - -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. 
- IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(Pthread) -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") + kokkos_create_imported_tpl_library(Pthread) +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/packages/kokkos/cmake/deps/quadmath.cmake b/packages/kokkos/cmake/deps/quadmath.cmake index 6aef08e8812fe23de9cc9802a3f7c46730b4c60f..9006d0cb9efb152a619e09af236c6b877a7feae4 100644 --- a/packages/kokkos/cmake/deps/quadmath.cmake +++ b/packages/kokkos/cmake/deps/quadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +kokkos_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/packages/kokkos/cmake/fake_tribits.cmake b/packages/kokkos/cmake/fake_tribits.cmake index 4c5331ec793b28b9d6e1343ae2b0d746dd785242..d3fe1e6e2f628a7c04b53788971797baef138a55 100644 --- a/packages/kokkos/cmake/fake_tribits.cmake +++ b/packages/kokkos/cmake/fake_tribits.cmake @@ -1,296 +1,213 @@ #These are tribits wrappers used by all projects in 
the Kokkos ecosystem -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) +include(CMakeParseArguments) +include(CTest) -FUNCTION(ASSERT_DEFINED VARS) - FOREACH(VAR ${VARS}) - IF(NOT DEFINED ${VAR}) - MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!") - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -IF(NOT KOKKOS_HAS_TRILINOS) -MACRO(APPEND_GLOB VAR) - FILE(GLOB LOCAL_TMP_VAR ${ARGN}) - LIST(APPEND ${VAR} ${LOCAL_TMP_VAR}) -ENDMACRO() - -MACRO(GLOBAL_SET VARNAME) - SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) -ENDMACRO() - -MACRO(PREPEND_GLOBAL_SET VARNAME) - ASSERT_DEFINED(${VARNAME}) - GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) -ENDMACRO() -ENDIF() - -MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") - ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) - SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE) -ENDMACRO() - -FUNCTION(KOKKOS_ADD_TEST) - if (KOKKOS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "SKIP_TRIBITS" - "EXE;NAME;TOOL" - "ARGS" - ${ARGN}) - - IF(TEST_SKIP_TRIBITS) - MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") - RETURN() - ENDIF() - - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - COMM serial mpi - NUM_MPI_PROCS 1 - ARGS ${TEST_ARGS} - ${TEST_UNPARSED_ARGUMENTS} - ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED - ) - - # We will get prepended package name here - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - - # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults - # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, - # the test won't actually be added and attempting to set a property on it below - # will yield an error. 
- if(TARGET ${EXE}) - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - foreach(TEST_ADDED ${ALL_TESTS_ADDED}) - set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_TOOLS_LIBS=$<TARGET_FILE:${TEST_TOOL}>") - endforeach() - endif() +function(ASSERT_DEFINED VARS) + foreach(VAR ${VARS}) + if(NOT DEFINED ${VAR}) + message(SEND_ERROR "Error, the variable ${VAR} is not defined!") endif() + endforeach() +endfunction() + +macro(APPEND_GLOB VAR) + file(GLOB LOCAL_TMP_VAR ${ARGN}) + list(APPEND ${VAR} ${LOCAL_TMP_VAR}) +endmacro() + +macro(GLOBAL_SET VARNAME) + set(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) +endmacro() + +macro(PREPEND_GLOBAL_SET VARNAME) + assert_defined(${VARNAME}) + global_set(${VARNAME} ${ARGN} ${${VARNAME}}) +endmacro() + +macro(ADD_INTERFACE_LIBRARY LIB_NAME) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") + add_library(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + set_target_properties(${LIB_NAME} PROPERTIES INTERFACE TRUE) +endmacro() + +function(KOKKOS_ADD_TEST) + cmake_parse_arguments( + TEST "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" + ${ARGN} + ) + # To match Tribits, we should always be receiving + # the root names of exes/libs + if(TEST_EXE) + set(EXE_ROOT ${TEST_EXE}) else() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL;SKIP_TRIBITS" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" - "CATEGORIES;ARGS" - ${ARGN}) - # To match Tribits, we should always be receiving - # the root names of exes/libs - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - # Prepend package name to the test name - # These should be the full target name - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) 
- ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_TOOL) - ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>") - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - ENDIF() -ENDFUNCTION() + set(EXE_ROOT ${TEST_NAME}) + endif() + # Prepend package name to the test name + # These should be the full target name + set(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) + + # For compatibility with Trilinos testing, we support: + # * `-D <fullTestName>_DISABLE=ON` + # * `-D <fullTestName>_EXTRA_ARGS="<arg0>;<arg1>;<arg2>;..."` + # * `-D <fullTestName>_SET_RUN_SERIAL=ON` + if(${TEST_NAME}_DISABLE) + return() + endif() -FUNCTION(KOKKOS_ADD_ADVANCED_TEST) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_ADVANCED_TEST(${ARGN}) + set(EXE ${PACKAGE_NAME}_${EXE_ROOT}) + if(WIN32) + add_test(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} + ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS} + ) else() - # TODO Write this + add_test(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS}) endif() -ENDFUNCTION() - -MACRO(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) - ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) - TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) - TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) -ENDMACRO() + # Trilinos testing benefits from labeling the 
tests as "Kokkos" tests + set_tests_properties(${TEST_NAME} PROPERTIES LABELS Kokkos) + if(${TEST_NAME}_SET_RUN_SERIAL) + set_tests_properties(${TEST_NAME} PROPERTIES RUN_SERIAL ON) + endif() + # TriBITS doesn't actually currently support `-D <fullTestName>_ENVIRONMENT` + # but we decided to add it anyway + if(${TEST_NAME}_ENVIRONMENT) + set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT "${${TEST_NAME}_ENVIRONMENT}") + endif() + if(TEST_WILL_FAIL) + set_tests_properties(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + endif() + if(TEST_FAIL_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + endif() + if(TEST_PASS_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + endif() + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + set_property( + TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>" + ) + endif() + verify_empty(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +endfunction() + +macro(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) + add_interface_library(TPL_LIB_${TPL_NAME}) + target_link_libraries(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) + target_include_directories(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) +endmacro() + +function(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) + cmake_parse_arguments(PARSE "" "" "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" ${ARGN}) + + set(_${TPL_NAME}_ENABLE_SUCCESS TRUE) + if(PARSE_REQUIRED_LIBS_NAMES) + find_library(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) + if(NOT TPL_${TPL_NAME}_LIBRARIES) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(PARSE_REQUIRED_HEADERS) + find_path(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) + if(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) + 
set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(_${TPL_NAME}_ENABLE_SUCCESS) + kokkos_create_imported_tpl_library(${TPL_NAME}) + endif() + verify_empty(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() -FUNCTION(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES(${TPL_NAME} ${ARGN}) +function(KOKKOS_LIB_TYPE LIB RET) + get_target_property(PROP ${LIB} TYPE) + if(${PROP} STREQUAL "INTERFACE_LIBRARY") + set(${RET} "INTERFACE" PARENT_SCOPE) else() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" - ${ARGN}) - - SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE) - IF (PARSE_REQUIRED_LIBS_NAMES) - FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) - IF(NOT TPL_${TPL_NAME}_LIBRARIES) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (PARSE_REQUIRED_HEADERS) - FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) - IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (_${TPL_NAME}_ENABLE_SUCCESS) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME}) - ENDIF() - VERIFY_EMPTY(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) + set(${RET} "PUBLIC" PARENT_SCOPE) endif() -ENDFUNCTION() - -MACRO(KOKKOS_TARGET_COMPILE_OPTIONS TARGET) -if(KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -else() - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -endif() -ENDMACRO() - -FUNCTION(KOKKOS_LIB_TYPE LIB RET) -GET_TARGET_PROPERTY(PROP ${LIB} TYPE) -IF (${PROP} STREQUAL "INTERFACE_LIBRARY") - SET(${RET} "INTERFACE" PARENT_SCOPE) -ELSE() - SET(${RET} "PUBLIC" PARENT_SCOPE) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) -IF(KOKKOS_HAS_TRILINOS) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - #don't trust tribits to do this correctly - but need to add package name - TARGET_INCLUDE_DIRECTORIES(${TARGET} 
${INCTYPE} ${ARGN}) -ELSEIF(TARGET ${TARGET}) - #the target actually exists - this means we are doing separate libs - #or this a test library - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSE() - GET_PROPERTY(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - IF (${TARGET} IN_LIST LIBS) - SET_PROPERTY(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) - ELSE() - MESSAGE(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") - ENDIF() -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) -IF(KOKKOS_HAS_TRILINOS) - #do nothing -ELSE() - SET(options INTERFACE) - SET(oneValueArgs) - SET(multiValueArgs) - CMAKE_PARSE_ARGUMENTS(PARSE - "INTERFACE" - "" - "" - ${ARGN}) - SET(LINK_TYPE) - IF(PARSE_INTERFACE) - SET(LINK_TYPE INTERFACE) - ELSE() - SET(LINK_TYPE PUBLIC) - ENDIF() - TARGET_LINK_LIBRARIES(${TARGET} ${LINK_TYPE} ${DEPLIB}) - VERIFY_EMPTY(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_TEST_LIBRARY NAME) -IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN} TESTONLY) -ELSE() - SET(oneValueArgs) - SET(multiValueArgs HEADERS SOURCES) - - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES;DEPLIBS" - ${ARGN}) - - SET(LIB_TYPE) - IF (PARSE_STATIC) - SET(LIB_TYPE STATIC) - ELSEIF (PARSE_SHARED) - SET(LIB_TYPE SHARED) - ENDIF() - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - ADD_LIBRARY(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) - IF (PARSE_DEPLIBS) - TARGET_LINK_LIBRARIES(${NAME} PRIVATE ${PARSE_DEPLIBS}) - ENDIF() -ENDIF() -ENDFUNCTION() +endfunction() + +function(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) + if(TARGET ${TARGET}) + #the target actually exists - this means we are doing separate libs + #or this a test library + kokkos_lib_type(${TARGET} INCTYPE) + 
target_include_directories(${TARGET} ${INCTYPE} ${ARGN}) + else() + get_property(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + if(${TARGET} IN_LIST LIBS) + set_property(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) + else() + message(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") + endif() + endif() +endfunction() + +function(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) + set(options INTERFACE) + set(oneValueArgs) + set(multiValueArgs) + cmake_parse_arguments(PARSE "INTERFACE" "" "" ${ARGN}) + set(LINK_TYPE) + if(PARSE_INTERFACE) + set(LINK_TYPE INTERFACE) + else() + set(LINK_TYPE PUBLIC) + endif() + target_link_libraries(${TARGET} ${LINK_TYPE} ${DEPLIB}) + verify_empty(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() +function(KOKKOS_ADD_TEST_LIBRARY NAME) + set(oneValueArgs) + set(multiValueArgs HEADERS SOURCES) -FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) -IF(KOKKOS_HAS_TRILINOS) - TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS( - INC - "REQUIRED_DURING_INSTALLATION_TESTING" - "" - "" - ${ARGN} - ) - INCLUDE_DIRECTORIES(${INC_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES;DEPLIBS" ${ARGN}) + set(LIB_TYPE) + if(PARSE_STATIC) + set(LIB_TYPE STATIC) + elseif(PARSE_SHARED) + set(LIB_TYPE SHARED) + endif() -MACRO(PRINTALL match) -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - if("${_variableName}" MATCHES "${match}") - message(STATUS "${_variableName}=${${_variableName}}") + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) endif() -endforeach() -ENDMACRO() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + add_library(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) + if(PARSE_DEPLIBS) + target_link_libraries(${NAME} PRIVATE ${PARSE_DEPLIBS}) + endif() +endfunction() + +function(KOKKOS_INCLUDE_DIRECTORIES) + cmake_parse_arguments(INC 
"REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN}) + include_directories(${INC_UNPARSED_ARGUMENTS}) +endfunction() + +macro(PRINTALL match) + get_cmake_property(_variableNames VARIABLES) + list(SORT _variableNames) + foreach(_variableName ${_variableNames}) + if("${_variableName}" MATCHES "${match}") + message(STATUS "${_variableName}=${${_variableName}}") + endif() + endforeach() +endmacro() -MACRO(SET_GLOBAL_REPLACE SUBSTR VARNAME) - STRING(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDMACRO() +macro(SET_GLOBAL_REPLACE SUBSTR VARNAME) + string(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) + global_set(${VARNAME} ${TEMP}) +endmacro() -FUNCTION(GLOBAL_APPEND VARNAME) +function(GLOBAL_APPEND VARNAME) #We make this a function since we are setting variables #and want to use scope to avoid overwriting local variables - SET(TEMP ${${VARNAME}}) - LIST(APPEND TEMP ${ARGN}) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDFUNCTION() + set(TEMP ${${VARNAME}}) + list(APPEND TEMP ${ARGN}) + global_set(${VARNAME} ${TEMP}) +endfunction() diff --git a/packages/kokkos/cmake/gnu.cmake b/packages/kokkos/cmake/gnu.cmake index aa11fe87b111970ea440a3765c06d0b31b402d15..e53b4a7becdd04eb33880fba038ad975e728b99c 100644 --- a/packages/kokkos/cmake/gnu.cmake +++ b/packages/kokkos/cmake/gnu.cmake @@ -1,23 +1,21 @@ - -FUNCTION(kokkos_set_gnu_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_gnu_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. 
- IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) + endif() +endfunction() diff --git a/packages/kokkos/cmake/intel.cmake b/packages/kokkos/cmake/intel.cmake index 7e6ee3358c90940195b7b8dd589f1fa500ad063f..b7752caabdf86f3ed6d36b09dbb1f30b2f032a22 100644 --- a/packages/kokkos/cmake/intel.cmake +++ b/packages/kokkos/cmake/intel.cmake @@ -1,18 +1,15 @@ - -FUNCTION(kokkos_set_intel_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_intel_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. 
- IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) -ENDFUNCTION() - - + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + set(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) +endfunction() diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake index bccf674d7633a01cc896f97d4d26f61097e930cb..ae45da806f73dff5580bc44f907802a531104f5e 100644 --- a/packages/kokkos/cmake/kokkos_arch.cmake +++ b/packages/kokkos/cmake/kokkos_arch.cmake @@ -1,1132 +1,1376 @@ - -FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) +function(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) #all optimizations off by default - KOKKOS_DEPENDENT_OPTION(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) - SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - IF(KOKKOS_ARCH_${SUFFIX}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) - SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - + kokkos_dependent_option(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) + set(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + set(KOKKOS_OPTION_TYPES 
${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + if(KOKKOS_ARCH_${SUFFIX}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) + set(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) + endif() +endfunction() # Make sure devices and compiler ID are done -KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID) -KOKKOS_CFG_DEPENDS(ARCH DEVICES) -KOKKOS_CFG_DEPENDS(ARCH OPTIONS) +kokkos_cfg_depends(ARCH COMPILER_ID) +kokkos_cfg_depends(ARCH DEVICES) +kokkos_cfg_depends(ARCH OPTIONS) -KOKKOS_CHECK_DEPRECATED_OPTIONS( - ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" - ARCH_RYZEN "Please replace RYZEN with ZEN or ZEN2, depending on your platform" +kokkos_check_deprecated_options( + ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" ARCH_RYZEN + "Please replace RYZEN with ZEN or ZEN2, depending on your platform" ) #------------------------------------------------------------------------------- # List of possible host architectures. #------------------------------------------------------------------------------- -SET(KOKKOS_ARCH_LIST) - - -KOKKOS_DEPRECATED_LIST(ARCH ARCH) - -SET(HOST_ARCH_ALREADY_SPECIFIED "") -MACRO(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) - KOKKOS_ARCH_OPTION(${ARCH} HOST "${LABEL}" TRUE) - IF(KOKKOS_ARCH_${ARCH}) - IF(HOST_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. 
If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) - ENDIF() -ENDMACRO() - -DECLARE_AND_CHECK_HOST_ARCH(NATIVE "local machine") -DECLARE_AND_CHECK_HOST_ARCH(AMDAVX "AMD chip") -DECLARE_AND_CHECK_HOST_ARCH(ARMV80 "ARMv8.0 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") -DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(WSM "Intel Westmere CPU") -DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") -DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") -DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ICL "Intel Ice Lake Client CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(SKL "Intel Skylake Client CPUs") -DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(BGQ "IBM Blue Gene Q") -DECLARE_AND_CHECK_HOST_ARCH(POWER7 "IBM POWER7 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") - -IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_CUDA_ARCHS ON) -ENDIF() - -KOKKOS_ARCH_OPTION(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER32 GPU 
"NVIDIA Kepler generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") - -IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -ENDIF() +set(KOKKOS_ARCH_LIST) + +include(CheckCXXCompilerFlag) + +kokkos_deprecated_list(ARCH ARCH) + +set(HOST_ARCH_ALREADY_SPECIFIED "") +macro(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) + kokkos_arch_option(${ARCH} HOST "${LABEL}" TRUE) + if(KOKKOS_ARCH_${ARCH}) + if(HOST_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." 
+ ) + endif() + set(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) + endif() +endmacro() + +declare_and_check_host_arch(NATIVE "local machine") +declare_and_check_host_arch(AMDAVX "AMD chip") +declare_and_check_host_arch(ARMV80 "ARMv8.0 Compatible CPU") +declare_and_check_host_arch(ARMV81 "ARMv8.1 Compatible CPU") +declare_and_check_host_arch(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") +declare_and_check_host_arch(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") +declare_and_check_host_arch(A64FX "ARMv8.2 with SVE Support") +declare_and_check_host_arch(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") +declare_and_check_host_arch(SNB "Intel Sandy/Ivy Bridge CPUs") +declare_and_check_host_arch(HSW "Intel Haswell CPUs") +declare_and_check_host_arch(BDW "Intel Broadwell Xeon E-class CPUs") +declare_and_check_host_arch(ICL "Intel Ice Lake Client CPUs (AVX512)") +declare_and_check_host_arch(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(SKL "Intel Skylake Client CPUs") +declare_and_check_host_arch(SKX "Intel Skylake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(KNC "Intel Knights Corner Xeon Phi") +declare_and_check_host_arch(KNL "Intel Knights Landing Xeon Phi") +declare_and_check_host_arch(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(POWER8 "IBM POWER8 CPUs") +declare_and_check_host_arch(POWER9 "IBM POWER9 CPUs") +declare_and_check_host_arch(ZEN "AMD Zen architecture") +declare_and_check_host_arch(ZEN2 "AMD Zen2 architecture") +declare_and_check_host_arch(ZEN3 "AMD Zen3 architecture") +declare_and_check_host_arch(RISCV_SG2042 "SG2042 (RISC-V) CPUs") +declare_and_check_host_arch(RISCV_RVA22V "RVA22V (RISC-V) CPUs") + +if(Kokkos_ENABLE_CUDA + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_CUDA_ARCHS ON) +endif() + +kokkos_arch_option(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER32 GPU "NVIDIA Kepler 
generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") + +if(Kokkos_ENABLE_HIP + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_HIP_ARCHS ON) +endif() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI300) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942) -LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) -LIST(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) -LIST(APPEND 
CORRESPONDING_AMD_FLAGS gfx906 gfx906) -LIST(APPEND SUPPORTED_AMD_GPUS RX7900XTX RX7900XTX V620/W6800 V620/W6800) -LIST(APPEND SUPPORTED_AMD_ARCHS NAVI1100 AMD_GFX1100 NAVI1030 AMD_GFX1030) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx1100 gfx1100 gfx1030 gfx1030) +list(APPEND SUPPORTED_AMD_GPUS MI300 MI300A MI300) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX942_APU AMD_GFX940) +list(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx942 gfx940) +list(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) +list(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) +list(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) +list(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) +list(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) +list(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) +list(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) +list(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17 -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - KOKKOS_ARCH_OPTION(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") -ENDFOREACH() - -IF(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) - SET(KOKKOS_SHOW_SYCL_ARCHS ON) -ENDIF() - -KOKKOS_ARCH_OPTION(INTEL_GEN GPU "SPIR64-based devices, e.g. 
Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") - -IF(KOKKOS_ENABLE_COMPILER_WARNINGS) - SET(COMMON_WARNINGS - "-Wall" "-Wextra" "-Wunused-parameter" "-Wshadow" "-pedantic" - "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + kokkos_arch_option(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") +endforeach() + +if(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) + set(KOKKOS_SHOW_SYCL_ARCHS ON) +endif() + +kokkos_arch_option(INTEL_GEN GPU "SPIR64-based devices, e.g. 
Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") + +if(KOKKOS_ENABLE_COMPILER_WARNINGS) + set(COMMON_WARNINGS + "-Wall" + "-Wextra" + "-Wunused-parameter" + "-Wshadow" + "-pedantic" + "-Wsign-compare" + "-Wtype-limits" + "-Wuninitialized" + "-Wsuggest-override" + ) # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH - IF(Kokkos_ENABLE_LIBQUADMATH) + if(Kokkos_ENABLE_LIBQUADMATH) # warning: non-standard suffix on floating constant [-Wpedantic] - LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") - ENDIF() + list(REMOVE_ITEM COMMON_WARNINGS "-pedantic") + endif() # NVHPC compiler does not support -Wtype-limits. 
- IF(KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") - ENDIF() - ENDIF() - - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - LIST(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") - ENDIF() - - SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" - ${COMMON_WARNINGS}) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - LIST(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") - ENDIF() + if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") + endif() + endif() + + # ICPC doesn't support -Wsuggest-override + if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + list(REMOVE_ITEM COMMON_WARNINGS "-Wsuggest-override") + endif() + + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + list(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") + endif() + + set(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" ${COMMON_WARNINGS}) + if(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + list(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") + endif() # Not using COMPILER_SPECIFIC_FLAGS function so the warning flags are not passed downstream - IF(CMAKE_CXX_COMPILER_ID STREQUAL GNU) - STRING(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") - ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + string(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) # FIXME_NVHPC - ELSE() - STRING(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") - ENDIF() - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") -ENDIF() - + else() + string(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") +endif() #------------------------------- KOKKOS_CUDA_OPTIONS --------------------------- #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") - ENDIF() -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-extended-lambda") + global_append(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") +endif() + +if(KOKKOS_ENABLE_CUDA_CONSTEXPR) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") + endif() +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + set(CUDA_ARCH_FLAG "--cuda-gpu-arch") + global_append(KOKKOS_CUDA_OPTIONS -x cuda) # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR - IF (Kokkos_CUDA_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) - ELSEIF(CUDAToolkit_BIN_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) 
- ENDIF() - IF (KOKKOS_ENABLE_CUDA) - SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) - ENDIF() -ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - SET(CUDA_ARCH_FLAG "-gpu") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) - IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda) - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - SET(CUDA_ARCH_FLAG "-arch") -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - STRING(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) - IF (KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -lineinfo) - ENDIF() - UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -ENDIF() - + if(Kokkos_CUDA_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + elseif(CUDAToolkit_BIN_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + set(CUDA_ARCH_FLAG "-arch") +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + string(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) + if(KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + global_append(KOKKOS_CUDA_OPTIONS -lineinfo) + endif() + unset(_UPPERCASE_CMAKE_BUILD_TYPE) +endif() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- +kokkos_option(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") +kokkos_option(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") +mark_as_advanced(Kokkos_IMPL_AMDGPU_FLAGS) +mark_as_advanced(Kokkos_IMPL_AMDGPU_LINK) + #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_ENABLE_HIP) - SET(AMDGPU_ARCH_FLAG "--offload-arch") - IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF (NOT CMAKE_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") - ENDIF() - 
GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -xhip) - IF(DEFINED ENV{ROCM_PATH}) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) - ENDIF() - ENDIF() -ENDIF() - - -IF(KOKKOS_ARCH_NATIVE) - IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") - MESSAGE(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") - ENDIF() - - STRING(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) - IF(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") - SET(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") - ELSE() - SET(KOKKOS_NATIVE_FLAGS "-mcpu=native") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - NVHPC -tp=native - DEFAULT ${KOKKOS_NATIVE_FLAGS} +global_set(KOKKOS_AMDGPU_OPTIONS) +if(KOKKOS_ENABLE_HIP) + set(AMDGPU_ARCH_FLAG "--offload-arch") + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(NOT CMAKE_CXX_STANDARD) + message(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") + endif() + global_append(KOKKOS_AMDGPU_OPTIONS -xhip) + if(DEFINED ENV{ROCM_PATH}) + global_append(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + endif() + endif() +endif() + +if(KOKKOS_ARCH_NATIVE) + if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") + message(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") + endif() + + string(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) + if(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") + set(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") + else() + set(KOKKOS_NATIVE_FLAGS "-mcpu=native") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID NVHPC -tp=native DEFAULT ${KOKKOS_NATIVE_FLAGS}) +endif() + +if(KOKKOS_ARCH_ARMV80) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV80) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID 
KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a +endif() + +if(KOKKOS_ARCH_ARMV81) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.1-a ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV81) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.1-a +endif() + +if(KOKKOS_ARCH_ARMV8_THUNDERX) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a + -mtune=thunderx ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV8_THUNDERX) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a -mtune=thunderx +endif() + +if(KOKKOS_ARCH_ARMV8_THUNDERX2) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=thunderx2t99 + -mtune=thunderx2t99 ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV8_THUNDERX2) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99 +endif() + +if(KOKKOS_ARCH_A64FX) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + 
COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Clang + -march=armv8.2-a+sve + -msve-vector-bits=512 + GNU + -march=armv8.2-a+sve + -msve-vector-bits=512 + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.2-a+sve ) -ENDIF() - -IF (KOKKOS_ARCH_A64FX) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Clang -march=armv8.2-a+sve -msve-vector-bits=512 - GNU -march=armv8.2-a+sve -msve-vector-bits=512 - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.2-a+sve +endif() + +if(KOKKOS_ARCH_ARMV9_GRACE) + set(KOKKOS_ARCH_ARM_NEON ON) + check_cxx_compiler_flag("-mcpu=neoverse-n2" COMPILER_SUPPORTS_NEOVERSE_N2) + check_cxx_compiler_flag("-msve-vector-bits=128" COMPILER_SUPPORTS_SVE_VECTOR_BITS) + if(COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128) + else() + message(WARNING "Compiler does not support ARMv9 Grace architecture") + endif() +endif() + +if(KOKKOS_ARCH_ZEN) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen + DEFAULT + -march=znver1 + -mtune=znver1 ) -ENDIF() - -IF (KOKKOS_ARCH_ZEN) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen - DEFAULT -march=znver1 -mtune=znver1 + set(KOKKOS_ARCH_AMD_ZEN ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_ZEN2) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver2 + -mtune=znver2 ) - SET(KOKKOS_ARCH_AMD_ZEN ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_ZEN2) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver2 -mtune=znver2 + 
set(KOKKOS_ARCH_AMD_ZEN2 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_ZEN3) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver3 + -mtune=znver3 ) - SET(KOKKOS_ARCH_AMD_ZEN2 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_ZEN3) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver3 -mtune=znver3 + set(KOKKOS_ARCH_AMD_ZEN3 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() + +if(KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) + set(KOKKOS_ARCH_AVX ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -mavx + MSVC + /arch:AVX + NVHPC + -tp=sandybridge + DEFAULT + -mavx ) - SET(KOKKOS_ARCH_AMD_ZEN3 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() - -IF (KOKKOS_ARCH_WSM) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSSE4.2 - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=px - DEFAULT -msse4.2 +endif() + +if(KOKKOS_ARCH_HSW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 ) - SET(KOKKOS_ARCH_SSE42 ON) -ENDIF() - -IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) - SET(KOKKOS_ARCH_AVX ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -mavx - MSVC /arch:AVX - NVHPC -tp=sandybridge - DEFAULT -mavx +endif() + +if(KOKKOS_ARCH_RISCV_SG2042) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) -ENDIF() - -IF (KOKKOS_ARCH_HSW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID 
KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -march=rv64imafdcv) +endif() + +if(KOKKOS_ARCH_RISCV_RVA22V) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) -ENDIF() - -IF (KOKKOS_ARCH_BDW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT + -march=rv64imafdcv_sscofpmf_sstc_svpbmt_zicbom_zicboz_zicbop_zihintpause ) -ENDIF() - -IF (KOKKOS_ARCH_KNL) - #avx512-mic - SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xMIC-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=knl - DEFAULT -march=knl -mtune=knl +endif() + +if(KOKKOS_ARCH_BDW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 + -mrtm ) -ENDIF() +endif() -IF (KOKKOS_ARCH_KNC) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - DEFAULT -mmic +if(KOKKOS_ARCH_KNL) + #avx512-mic + set(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + 
-xMIC-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=knl + DEFAULT + -march=knl + -mtune=knl ) -ENDIF() - -IF (KOKKOS_ARCH_SKL) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSKYLAKE - MSVC /arch:AVX2 - NVHPC -tp=skylake - DEFAULT -march=skylake -mtune=skylake +endif() + +if(KOKKOS_ARCH_KNC) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC NO-VALUE-SPECIFIED DEFAULT -mmic) +endif() + +if(KOKKOS_ARCH_SKL) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xSKYLAKE + MSVC + /arch:AVX2 + NVHPC + -tp=skylake + DEFAULT + -march=skylake + -mtune=skylake ) -ENDIF() - -IF (KOKKOS_ARCH_SKX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=skylake - DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 +endif() + +if(KOKKOS_ARCH_SKX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=skylake + DEFAULT + -march=skylake-avx512 + -mtune=skylake-avx512 ) -ENDIF() - -IF (KOKKOS_ARCH_ICL) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-client -mtune=icelake-client +endif() + +if(KOKKOS_ARCH_ICL) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-client + -mtune=icelake-client ) -ENDIF() - -IF (KOKKOS_ARCH_ICX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-server -mtune=icelake-server +endif() + +if(KOKKOS_ARCH_ICX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + 
COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-server + -mtune=icelake-server ) -ENDIF() - -IF (KOKKOS_ARCH_SPR) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=sapphirerapids -mtune=sapphirerapids +endif() + +if(KOKKOS_ARCH_SPR) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=sapphirerapids + -mtune=sapphirerapids ) -ENDIF() - -IF (KOKKOS_ARCH_POWER7) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=power7 -mtune=power7 +endif() + +if(KOKKOS_ARCH_POWER7) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=power7 + -mtune=power7 ) -ENDIF() - -IF (KOKKOS_ARCH_POWER8) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr8 - DEFAULT -mcpu=power8 -mtune=power8 +endif() + +if(KOKKOS_ARCH_POWER8) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr8 + DEFAULT + -mcpu=power8 + -mtune=power8 ) -ENDIF() - -IF (KOKKOS_ARCH_POWER9) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr9 - DEFAULT -mcpu=power9 -mtune=power9 +endif() + +if(KOKKOS_ARCH_POWER9) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr9 + DEFAULT + -mcpu=power9 + -mtune=power9 ) -ENDIF() +endif() # If Kokkos_ARCH_NATIVE is enabled, we are trying to autodetect # the SIMD capabilities based on compiler macros. 
-IF (KOKKOS_ARCH_NATIVE) +if(KOKKOS_ARCH_NATIVE) # Make sure to rerun the checks if compile options have changed - IF(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") - SET(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") - - SET(CMAKE_REQUIRED_QUIET ON) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - - UNSET(KOKKOS_COMPILER_HAS_AVX512 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) - UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) - UNSET(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) - CHECK_CXX_SYMBOL_EXISTS(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) - UNSET(KOKKOS_COMPILER_HAS_AVX CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - - UNSET(CMAKE_REQUIRED_QUIET) - UNSET(CMAKE_REQUIRED_FLAGS) - ENDIF() + if(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") + set(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") + + set(CMAKE_REQUIRED_QUIET ON) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + + unset(KOKKOS_COMPILER_HAS_AVX512 CACHE) + check_cxx_symbol_exists(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) + unset(KOKKOS_COMPILER_HAS_AVX2 CACHE) + check_cxx_symbol_exists(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) + unset(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) + check_cxx_symbol_exists(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) + unset(KOKKOS_COMPILER_HAS_AVX CACHE) + check_cxx_symbol_exists(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + + unset(CMAKE_REQUIRED_QUIET) + unset(CMAKE_REQUIRED_FLAGS) + endif() # Only define one of these macros for now # to be uniform with what we are doing for other architectures. 
- IF(KOKKOS_COMPILER_HAS_AVX512) - MESSAGE(STATUS "SIMD: AVX512 detected") - SET(KOKKOS_ARCH_AVX512XEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX2) - MESSAGE(STATUS "SIMD: AVX2 detected") - SET(KOKKOS_ARCH_AVX2 ON) - ELSEIF(KOKKOS_COMPILER_HAS_ARM_NEON) - MESSAGE(STATUS "SIMD: ARM_NEON detected") - SET(KOKKOS_ARCH_ARM_NEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX) - MESSAGE(STATUS "SIMD: AVX detected") - SET(KOKKOS_ARCH_AVX ON) - ENDIF() -ENDIF() + if(KOKKOS_COMPILER_HAS_AVX512) + message(STATUS "SIMD: AVX512 detected") + set(KOKKOS_ARCH_AVX512XEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX2) + message(STATUS "SIMD: AVX2 detected") + set(KOKKOS_ARCH_AVX2 ON) + elseif(KOKKOS_COMPILER_HAS_ARM_NEON) + message(STATUS "SIMD: ARM_NEON detected") + set(KOKKOS_ARCH_ARM_NEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX) + message(STATUS "SIMD: AVX detected") + set(KOKKOS_ARCH_AVX ON) + endif() +endif() # FIXME_NVHPC nvc++ doesn't seem to support AVX512. -IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON OFF) -ENDIF() - -IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - Clang -fcuda-rdc - NVIDIA --relocatable-device-code=true - NVHPC -gpu=rdc - ) - ELSEIF(KOKKOS_ENABLE_CUDA) - COMPILER_SPECIFIC_FLAGS( - NVHPC -gpu=nordc - ) - ENDIF() -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) + set(KOKKOS_ARCH_AVX512XEON OFF) +endif() + +# FIXME_NVCC nvcc doesn't seem to support Arm Neon. 
+if(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + unset(KOKKOS_ARCH_ARM_NEON) +endif() + +if(NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(Clang -fcuda-rdc NVIDIA --relocatable-device-code=true) + endif() +endif() # Clang needs mcx16 option enabled for Windows atomic functions -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) - COMPILER_SPECIFIC_OPTIONS( - Clang -mcx16 - ) -ENDIF() +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) + compiler_specific_options(Clang -mcx16) +endif() # MSVC ABI has many deprecation warnings, so ignore them -IF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - COMPILER_SPECIFIC_DEFS( - Clang _CRT_SECURE_NO_WARNINGS - ) -ENDIF() - +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + compiler_specific_defs(Clang _CRT_SECURE_NO_WARNINGS) +endif() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (KOKKOS_ENABLE_HIP) - IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fgpu-rdc - ) - ELSE() - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fno-gpu-rdc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization - ) - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-unnamed-lambda - ) -ENDIF() +if(KOKKOS_ENABLE_HIP) + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(DEFAULT -fgpu-rdc) + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + compiler_specific_link_options(DEFAULT --hip-link) + endif() + else() + compiler_specific_flags(DEFAULT -fno-gpu-rdc) + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + compiler_specific_flags(DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization) + compiler_specific_options(DEFAULT 
-fsycl-unnamed-lambda) + if(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2024.1.0) + # Before oneAPI 2024.1.0 passing -fno-sycl didn't work properly + if(NOT KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + message(FATAL_ERROR "Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=OFF requires oneAPI 2024.1.0 or later") + endif() + elseif(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT -fsycl-rdc) + else() + compiler_specific_options(DEFAULT -fno-sycl-rdc) + endif() +endif() # Check support for device_global variables -# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is -# available, use that instead. -IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) - STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - CHECK_CXX_SOURCE_COMPILES(" - #include <sycl/sycl.hpp> - using namespace sycl::ext::oneapi::experimental; - using namespace sycl; - - SYCL_EXTERNAL device_global<int, decltype(properties(device_image_scope))> Foo; - - void bar(queue q) { - q.single_task([=] { - Foo = 42; - }); - } - - int main(){ return 0; } - " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED +# FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device +# global variables with shared libraries using the "non-separable compilation" +# implementation. Otherwise, the feature is not supported when building shared +# libraries. Thus, we don't even check for support if shared libraries are +# requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. 
+if(KOKKOS_ENABLE_SYCL) + string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + if(KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. + compiler_specific_flags(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + elseif(NOT BUILD_SHARED_LIBS AND KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " + #include <sycl/sycl.hpp> + using namespace sycl::ext::oneapi::experimental; + using namespace sycl; + + SYCL_EXTERNAL device_global<int, decltype(properties(device_image_scope))> Foo; + + void bar(queue q) { + q.single_task([=] { + Foo = 42; + }); + } + + int main(){ return 0; } + " + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ) - ENDIF() -ENDIF() - -SET(CUDA_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) - MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_CUDA) - STRING(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) - IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) - ELSE() - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${FLAG}) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${NVHPC_CUDA_ARCH}") - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${NVHPC_CUDA_ARCH}") - ELSE() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() - ENDIF() - ENDIF() - LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) - SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) - LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) - SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) -ENDFUNCTION() + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + # Only the separable compilation implementation is supported. + compiler_specific_flags(DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + endif() + endif() + + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_GRAPH "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_GRAPH) +endif() + +set(CUDA_ARCH_ALREADY_SPECIFIED "") +function(CHECK_CUDA_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(CUDA_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." 
+ ) + endif() + set(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_CUDA + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_SYCL + AND NOT KOKKOS_ENABLE_OPENACC + ) + message( + WARNING + "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_CUDA) + string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) + endif() + set(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) + if(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + set(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) + else() + global_append(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() + endif() + list(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) + set(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) + list(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) + set(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) +endfunction() #These will define KOKKOS_CUDA_ARCH_FLAG #to the corresponding flag name if ON -CHECK_CUDA_ARCH(KEPLER30 sm_30) -CHECK_CUDA_ARCH(KEPLER32 sm_32) -CHECK_CUDA_ARCH(KEPLER35 sm_35) -CHECK_CUDA_ARCH(KEPLER37 sm_37) -CHECK_CUDA_ARCH(MAXWELL50 sm_50) -CHECK_CUDA_ARCH(MAXWELL52 sm_52) -CHECK_CUDA_ARCH(MAXWELL53 sm_53) -CHECK_CUDA_ARCH(PASCAL60 sm_60) -CHECK_CUDA_ARCH(PASCAL61 sm_61) -CHECK_CUDA_ARCH(VOLTA70 sm_70) -CHECK_CUDA_ARCH(VOLTA72 sm_72) -CHECK_CUDA_ARCH(TURING75 sm_75) -CHECK_CUDA_ARCH(AMPERE80 sm_80) -CHECK_CUDA_ARCH(AMPERE86 sm_86) -CHECK_CUDA_ARCH(ADA89 sm_89) -CHECK_CUDA_ARCH(HOPPER90 sm_90) - -SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) - 
IF(KOKKOS_ARCH_${ARCH}) - IF(AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_OPENACC AND NOT KOKKOS_ENABLE_SYCL) - MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_HIP) - SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() +check_cuda_arch(KEPLER30 sm_30) +check_cuda_arch(KEPLER32 sm_32) +check_cuda_arch(KEPLER35 sm_35) +check_cuda_arch(KEPLER37 sm_37) +check_cuda_arch(MAXWELL50 sm_50) +check_cuda_arch(MAXWELL52 sm_52) +check_cuda_arch(MAXWELL53 sm_53) +check_cuda_arch(PASCAL60 sm_60) +check_cuda_arch(PASCAL61 sm_61) +check_cuda_arch(VOLTA70 sm_70) +check_cuda_arch(VOLTA72 sm_72) +check_cuda_arch(TURING75 sm_75) +check_cuda_arch(AMPERE80 sm_80) +check_cuda_arch(AMPERE86 sm_86) +check_cuda_arch(ADA89 sm_89) +check_cuda_arch(HOPPER90 sm_90) + +set(AMDGPU_ARCH_ALREADY_SPECIFIED "") +function(CHECK_AMDGPU_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(AMDGPU_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." 
+ ) + endif() + set(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_HIP + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_OPENACC + AND NOT KOKKOS_ENABLE_SYCL + ) + message( + WARNING + "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_HIP) + set(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) + endif() + if(NOT KOKKOS_IMPL_AMDGPU_FLAGS) + set(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) + global_append(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + global_append(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() +endfunction() #These will define KOKKOS_AMDGPU_ARCH_FLAG #to the corresponding flag name if ON -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) -ENDFOREACH() - -MACRO(SET_AND_CHECK_AMD_ARCH ARCH FLAG) - KOKKOS_SET_OPTION(ARCH_${ARCH} ON) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) -ENDMACRO() - -MACRO(CHECK_MULTIPLE_INTEL_ARCH) - IF(KOKKOS_ARCH_INTEL_GPU) - MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") - ENDIF() - SET(KOKKOS_ARCH_INTEL_GPU ON) -ENDMACRO() - -IF(KOKKOS_ARCH_INTEL_GEN) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_DG1) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN9) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN11) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN12LP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_XEHP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_PVC) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() - 
-IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - IF (CLANG_CUDA_ARCH) - IF(KOKKOS_CLANG_IS_CRAY) - COMPILER_SPECIFIC_FLAGS( - Cray -fopenmp - ) - ELSE() - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 - NVHPC -gpu=${NVHPC_CUDA_ARCH} +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + check_amdgpu_arch(${ARCH} ${FLAG}) +endforeach() + +if(KOKKOS_IMPL_AMDGPU_FLAGS) + if(NOT AMDGPU_ARCH_ALREADY_SPECIFIED) + message(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " + "Please explicitly set the GPU architecture." + ) + endif() + global_append(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") + global_append(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") +endif() + +macro(SET_AND_CHECK_AMD_ARCH ARCH FLAG) + kokkos_set_option(ARCH_${ARCH} ON) + check_amdgpu_arch(${ARCH} ${FLAG}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) +endmacro() + +macro(CHECK_MULTIPLE_INTEL_ARCH) + if(KOKKOS_ARCH_INTEL_GPU) + message(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") + endif() + set(KOKKOS_ARCH_INTEL_GPU ON) +endmacro() + +if(KOKKOS_ARCH_INTEL_GEN) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_DG1) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN9) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN11) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN12LP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_XEHP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_PVC) + check_multiple_intel_arch() +endif() + +if(KOKKOS_ENABLE_OPENMP) + compiler_specific_link_options(CrayClang -fopenmp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + 
if(CLANG_CUDA_ARCH) + if(KOKKOS_CLANG_IS_CRAY) + compiler_specific_flags(Cray -fopenmp) + else() + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) + compiler_specific_flags( + Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 NVHPC -gpu=${NVHPC_CUDA_ARCH} ) - ENDIF() - ENDIF() - SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) - IF (CLANG_AMDGPU_ARCH) - COMPILER_SPECIFIC_FLAGS( + endif() + endif() + set(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) + if(CLANG_AMDGPU_ARCH) + compiler_specific_flags( Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa ) - ENDIF() - IF (KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" -D__STRICT_ANSI__ - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CUDA_ARCH_FLAG) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc 
-gpu=${NVHPC_CUDA_ARCH} - Clang -Xopenmp-target=nvptx64-nvidia-cuda -march=${CLANG_CUDA_ARCH} - -fopenmp-targets=nvptx64-nvidia-cuda - ) - ELSEIF(KOKKOS_AMDGPU_ARCH_FLAG) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} - -fopenmp-targets=amdgcn-amd-amdhsa - ) - ELSE() - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} + endif() + if(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__) + else() + compiler_specific_options(IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__) + if(KOKKOS_ARCH_INTEL_GEN9) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7") + endif() + endif() +endif() + +if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CUDA_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. 
Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." ) - ELSE() - MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(AMDGPU_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} + endif() + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + compiler_specific_flags( + NVHPC + -acc + -gpu=${NVHPC_CUDA_ARCH} + Clang + -Xopenmp-target=nvptx64-nvidia-cuda + -march=${CLANG_CUDA_ARCH} + -fopenmp-targets=nvptx64-nvidia-cuda + ) + if(DEFINED ENV{CUDA_PATH}) + compiler_specific_link_options(Clang -L$ENV{CUDA_PATH}/lib64) + endif() + compiler_specific_libs(Clang -lcudart NVHPC -cuda) + elseif(KOKKOS_AMDGPU_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." ) - ELSE() - MESSAGE(SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64 + endif() + compiler_specific_flags( + Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} -fopenmp-targets=amdgcn-amd-amdhsa ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen + if(DEFINED ENV{ROCM_PATH}) + compiler_specific_flags(Clang -I$ENV{ROCM_PATH}/include) + compiler_specific_link_options(Clang -L$ENV{ROCM_PATH}/lib) + endif() + compiler_specific_libs(Clang -lamdhip64) + elseif(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. 
+ compiler_specific_flags(NVHPC -acc=multicore) + else() + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. + compiler_specific_flags(NVHPC -acc=gpu,multicore) + message( + STATUS + "No OpenACC target device is specificed; the OpenACC backend will be executed in an automatic fallback mode." ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11" + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + if(CUDA_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda + --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" + else() + message( + SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4" + endif() + elseif(AMDGPU_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" + else() + message( + SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" 
) - ENDIF() - ENDIF() -ENDIF() - -IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) + endif() + elseif(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(DEFAULT -fsycl-targets=spir64) + elseif(KOKKOS_ARCH_INTEL_GPU) + set(SYCL_TARGET_FLAG -fsycl-targets=spir64_gen) + + if(KOKKOS_ARCH_INTEL_GEN9) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.60.7") + endif() + + if(Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG}) + compiler_specific_link_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + else() + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + endif() + endif() +endif() + +if(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device - SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) - FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR}) - FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) - - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + set(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) + file(REMOVE_RECURSE ${_BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) + + try_run(_RESULT _COMPILE_RESULT 
${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) # if user is using kokkos_compiler_launcher, above will fail. - IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + if(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) - GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + get_property(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough - IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + if(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) # make sure the user knows that we aren't using CUDA compiler for anything else - MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture...") - INCLUDE(CheckLanguage) - CHECK_LANGUAGE(CUDA) - IF(CMAKE_CUDA_COMPILER) - ENABLE_LANGUAGE(CUDA) - ELSE() - MESSAGE(STATUS "CUDA language could not be enabled") - ENDIF() - ENDIF() + message( + STATUS + "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture..." 
+ ) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(STATUS "CUDA language could not be enabled") + endif() + endif() # if CUDA was enabled, this will be defined - IF(CMAKE_CUDA_COMPILER) + if(CMAKE_CUDA_COMPILER) # copy our test to .cu so cmake compiles as CUDA - CONFIGURE_FILE( + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COPYONLY + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COPYONLY ) # run test again - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) - ENDIF() - ENDIF() - - LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) - IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) - MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") - LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) - KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON) - CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) - ELSE() - MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " - "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" - "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. 
" - "If you are cross-compiling, you should try to do this on a compute node.") - ENDIF() -ENDIF() + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) + endif() + endif() + + list(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) + if(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) + message(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") + list(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) + kokkos_set_option(ARCH_${ARCHITECTURE} ON) + check_cuda_arch(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) + else() + message( + SEND_ERROR + "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " + "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" + "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. " + "If you are cross-compiling, you should try to do this on a compute node." 
+ ) + endif() +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_KEPLER30 OR KOKKOS_ARCH_KEPLER32 OR KOKKOS_ARCH_KEPLER35 OR KOKKOS_ARCH_KEPLER37) - SET(KOKKOS_ARCH_KEPLER ON) -ENDIF() +if(KOKKOS_ARCH_KEPLER30 + OR KOKKOS_ARCH_KEPLER32 + OR KOKKOS_ARCH_KEPLER35 + OR KOKKOS_ARCH_KEPLER37 +) + set(KOKKOS_ARCH_KEPLER ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) - SET(KOKKOS_ARCH_MAXWELL ON) -ENDIF() +if(KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) + set(KOKKOS_ARCH_MAXWELL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) - SET(KOKKOS_ARCH_PASCAL ON) -ENDIF() +if(KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) + set(KOKKOS_ARCH_PASCAL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) - SET(KOKKOS_ARCH_VOLTA ON) -ENDIF() +if(KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) + set(KOKKOS_ARCH_VOLTA ON) +endif() -IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) -ENDIF() +if(KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) + set(KOKKOS_ARCH_AMPERE ON) +endif() -IF (KOKKOS_ARCH_HOPPER90) - SET(KOKKOS_ARCH_HOPPER ON) -ENDIF() +if(KOKKOS_ARCH_HOPPER90) + set(KOKKOS_ARCH_HOPPER ON) +endif() + +function(CHECK_AMD_APU ARCH) + set(BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/AmdApuWorkdir) + file(REMOVE_RECURSE ${BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${BINARY_TEST_DIR}) + + try_run(RESULT COMPILE_RESULT ${BINARY_TEST_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/amd_apu.cc + RUN_OUTPUT_VARIABLE AMD_APU + ) + + if(NOT COMPILE_RESULT OR NOT RESULT EQUAL 0) + message(SEND_ERROR "Autodetection of AMD APU failed." 
+ "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + + if(AMD_APU) + set(${ARCH} AMD_GFX942_APU PARENT_SCOPE) + endif() +endfunction() #HIP detection of gpu arch -IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED) - FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) - IF(NOT ROCM_ENUMERATOR) - MESSAGE(FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " - "rocm_agent_enumerator could not be found. " - "Please specify an arch manually via -DKokkos_ARCH_{..}=ON") - ELSE() - EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) - STRING(LENGTH "${GPU_ARCHS}" len_str) +if(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + find_program(ROCM_ENUMERATOR rocm_agent_enumerator) + if(NOT ROCM_ENUMERATOR) + message( + FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " "rocm_agent_enumerator could not be found. " + "Please specify an arch manually via -DKokkos_ARCH_{..}=ON" + ) + else() + execute_process(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) + string(LENGTH "${GPU_ARCHS}" len_str) # enumerator always output gfx000 as the first line - IF(${len_str} LESS 8) - MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. 
" - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - # check for known gpu archs, otherwise error out - ELSE() - SET(AMD_ARCH_DETECTED "") - FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - STRING(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) - IF("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") - SET_AND_CHECK_AMD_ARCH(${ARCH} ${FLAG}) - SET(AMD_ARCH_DETECTED ${ARCH}) - BREAK() - ENDIF() - ENDFOREACH() - IF("${AMD_ARCH_DETECTED}" STREQUAL "") - MESSAGE(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " - "is supported. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - ENDIF() - ENDIF() - ENDIF() -ENDIF() - -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - STRING(REGEX MATCH "90A" IS_90A ${ARCH}) - IF(IS_90A) - SET(KOKKOS_ARCH_AMD_GFX90A ON) - SET(KOKKOS_ARCH_VEGA90A ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "908" IS_908 ${ARCH}) - IF(IS_908) - SET(KOKKOS_ARCH_AMD_GFX908 ON) - SET(KOKKOS_ARCH_VEGA908 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "906" IS_906 ${ARCH}) - IF(IS_906) - SET(KOKKOS_ARCH_AMD_GFX906 ON) - SET(KOKKOS_ARCH_VEGA906 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1100" IS_1100 ${ARCH}) - IF(IS_1100) - SET(KOKKOS_ARCH_AMD_GFX1100 ON) - SET(KOKKOS_ARCH_NAVI1100 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1030" IS_1030 ${ARCH}) - IF(IS_1030) - SET(KOKKOS_ARCH_AMD_GFX1030 ON) - SET(KOKKOS_ARCH_NAVI1030 ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() + if(${len_str} LESS 8) + message(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." 
+ ) + # check for known gpu archs, otherwise error out + else() + set(AMD_ARCH_DETECTED "") + foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + string(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) + if("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") + # If we detected gfx942, we need to discriminate between APU and discrete GPU + if(FLAG STREQUAL "gfx942") + check_amd_apu(ARCH) + endif() + set_and_check_amd_arch(${ARCH} ${FLAG}) + set(AMD_ARCH_DETECTED ${ARCH}) + break() + endif() + endforeach() + if("${AMD_ARCH_DETECTED}" STREQUAL "") + message(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " "is supported. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + endif() + endif() +endif() + +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + string(REGEX MATCH "90A" IS_90A ${ARCH}) + if(IS_90A) + set(KOKKOS_ARCH_AMD_GFX90A ON) + set(KOKKOS_ARCH_VEGA90A ON) + break() + endif() + string(REGEX MATCH "908" IS_908 ${ARCH}) + if(IS_908) + set(KOKKOS_ARCH_AMD_GFX908 ON) + set(KOKKOS_ARCH_VEGA908 ON) + break() + endif() + string(REGEX MATCH "906" IS_906 ${ARCH}) + if(IS_906) + set(KOKKOS_ARCH_AMD_GFX906 ON) + set(KOKKOS_ARCH_VEGA906 ON) + break() + endif() + string(REGEX MATCH "1100" IS_1100 ${ARCH}) + if(IS_1100) + set(KOKKOS_ARCH_AMD_GFX1100 ON) + set(KOKKOS_ARCH_NAVI1100 ON) + break() + endif() + string(REGEX MATCH "1030" IS_1030 ${ARCH}) + if(IS_1030) + set(KOKKOS_ARCH_AMD_GFX1030 ON) + set(KOKKOS_ARCH_NAVI1030 ON) + break() + endif() + endif() +endforeach() #Regardless of version, make sure we define the general architecture name -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - SET(KOKKOS_ARCH_AMD_GPU ON) - STRING(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) - IF(IS_VEGA) - SET(KOKKOS_ARCH_VEGA ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "(NAVI)" IS_NAVI 
${ARCH}) - IF(IS_NAVI) - SET(KOKKOS_ARCH_NAVI ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + set(KOKKOS_ARCH_AMD_GPU "${FLAG}") + string(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) + if(IS_VEGA) + set(KOKKOS_ARCH_VEGA ON) + break() + endif() + string(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) + if(IS_NAVI) + set(KOKKOS_ARCH_NAVI ON) + break() + endif() + endif() +endforeach() #CMake verbose is kind of pointless #Let's just always print things -MESSAGE(STATUS "Built-in Execution Spaces:") - -FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_DEVICE_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_DEVICE_PARALLEL} is already enabled. 
" - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "Cuda") - IF(KOKKOS_ENABLE_CUDA_UVM) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead") - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") - ENDIF() - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - ENDIF() - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSEIF(${_BACKEND} STREQUAL "HIP") - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() -IF(NOT _DEVICE_PARALLEL) - SET(_DEVICE_PARALLEL "NoTypeDefined") - SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") -ENDIF() -MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") - -FOREACH (_BACKEND OpenMP Threads HPX) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_HOST_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_HOST_PARALLEL} is already enabled. 
" - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "HPX") - SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ELSE() - SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() - -IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "At least one host execution space must be enabled, " - "but no host parallel execution space was requested " - "and Kokkos_ENABLE_SERIAL=OFF.") -ENDIF() - -IF(_HOST_PARALLEL) -MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") -ELSE() - SET(_HOST_PARALLEL "NoTypeDefined") - MESSAGE(STATUS " Host Parallel: NoTypeDefined") -ENDIF() - -IF(KOKKOS_ENABLE_SERIAL) - MESSAGE(STATUS " Host Serial: SERIAL") -ELSE() - MESSAGE(STATUS " Host Serial: NONE") -ENDIF() - -MESSAGE(STATUS "") -MESSAGE(STATUS "Architectures:") -FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) - MESSAGE(STATUS " ${Arch}") -ENDFOREACH() +message(STATUS "Built-in Execution Spaces:") + +foreach(_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_DEVICE_PARALLEL) + message( + FATAL_ERROR + "Multiple device parallel execution spaces are not allowed! " + "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_DEVICE_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." 
+ ) + endif() + if(${_BACKEND} STREQUAL "Cuda") + if(KOKKOS_ENABLE_CUDA_UVM) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead" + ) + if(NOT KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") + endif() + endif() + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + elseif(${_BACKEND} STREQUAL "HIP" OR ${_BACKEND} STREQUAL "SYCL") + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + else() + set(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + endif() + endif() +endforeach() +if(NOT _DEVICE_PARALLEL) + set(_DEVICE_PARALLEL "NoTypeDefined") +endif() +message(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") + +foreach(_BACKEND OpenMP Threads HPX) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_HOST_PARALLEL) + message( + FATAL_ERROR + "Multiple host parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_HOST_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." + ) + endif() + if(${_BACKEND} STREQUAL "HPX") + set(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + else() + set(_HOST_PARALLEL "Kokkos::${_BACKEND}") + endif() + endif() +endforeach() + +if(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "At least one host execution space must be enabled, " + "but no host parallel execution space was requested " "and Kokkos_ENABLE_SERIAL=OFF." 
+ ) +endif() + +if(_HOST_PARALLEL) + message(STATUS " Host Parallel: ${_HOST_PARALLEL}") +else() + set(_HOST_PARALLEL "NoTypeDefined") + message(STATUS " Host Parallel: NoTypeDefined") +endif() + +if(KOKKOS_ENABLE_SERIAL) + message(STATUS " Host Serial: SERIAL") +else() + message(STATUS " Host Serial: NONE") +endif() + +message(STATUS "") +message(STATUS "Architectures:") +foreach(Arch ${KOKKOS_ENABLED_ARCH_LIST}) + message(STATUS " ${Arch}") +endforeach() + +if(KOKKOS_ENABLE_ATOMICS_BYPASS) + if(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + message( + FATAL_ERROR + "Disabling atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) is not allowed if a host parallel or a device backend is enabled!" + ) + endif() + if(NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "Implementation bug") # safeguard + endif() + message(STATUS "Atomics: **DISABLED**") +endif() diff --git a/packages/kokkos/cmake/kokkos_check_env.cmake b/packages/kokkos/cmake/kokkos_check_env.cmake index a455a403b9d5ed0fa3772d2d8e619347061bd65e..f1a309ff85799b4f242a849a09f6da041a058304 100644 --- a/packages/kokkos/cmake/kokkos_check_env.cmake +++ b/packages/kokkos/cmake/kokkos_check_env.cmake @@ -1,12 +1,15 @@ -SET(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) -IF (CRAYPE_VERSION) - SET(KOKKOS_IS_CRAYPE TRUE) - SET(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) - IF (CRAYPE_LINK_TYPE) - IF (NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") - MESSAGE(WARNING "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() - ELSE() - MESSAGE(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() -ENDIF() +set(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) +if(CRAYPE_VERSION) + set(KOKKOS_IS_CRAYPE TRUE) + set(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) + if(CRAYPE_LINK_TYPE) + if(NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") + message( + WARNING + "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. 
Linking is likely to fail unless this is set to 'dynamic'" + ) + endif() + else() + message(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") + endif() +endif() diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake index 04589befc3ada08c204a8242b096501723728d01..010ed33ede890f1e9d214cd0a34e26f38aa87f97 100644 --- a/packages/kokkos/cmake/kokkos_compiler_id.cmake +++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake @@ -1,232 +1,273 @@ -KOKKOS_CFG_DEPENDS(COMPILER_ID NONE) +kokkos_cfg_depends(COMPILER_ID NONE) -SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) -SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) -SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) +set(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) +set(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +set(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) -MACRO(kokkos_internal_have_compiler_nvcc) +macro(kokkos_internal_have_compiler_nvcc) # Check if the compiler is nvcc (which really means nvcc_wrapper). 
- EXECUTE_PROCESS(COMMAND ${ARGN} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") - IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) - SET(INTERNAL_HAVE_COMPILER_NVCC true) - ELSE() - SET(INTERNAL_HAVE_COMPILER_NVCC false) - ENDIF() -ENDMACRO() - -IF(Kokkos_ENABLE_CUDA) + execute_process(COMMAND ${ARGN} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + string(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + if(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + set(INTERNAL_HAVE_COMPILER_NVCC true) + else() + set(INTERNAL_HAVE_COMPILER_NVCC false) + endif() +endmacro() + +if(Kokkos_ENABLE_CUDA) # kokkos_enable_options is not yet called so use lower case here - IF(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) kokkos_internal_have_compiler_nvcc(${CMAKE_CUDA_COMPILER}) - ELSE() + else() # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS 
${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) # Check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # If launcher was found and nvcc_wrapper was not specified as # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher. # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper - IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang - AND NOT (Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) - IF(CMAKE_CXX_COMPILER_LAUNCHER) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - MESSAGE(STATUS "Using nvc++ as device compiler requires Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON!") - ENDIF() - MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!") - ENDIF() + if(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_LAUNCHER) + message( + FATAL_ERROR + "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!" + ) + endif() # the first argument to launcher is always the C++ compiler defined by cmake # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) - SET(INTERNAL_USE_COMPILER_LAUNCHER true) - ENDIF() - ENDIF() -ENDIF() + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} + -DKOKKOS_DEPENDENCE + ) + set(INTERNAL_USE_COMPILER_LAUNCHER true) + endif() + endif() +endif() -IF(INTERNAL_HAVE_COMPILER_NVCC) +if(INTERNAL_HAVE_COMPILER_NVCC) # Save the host compiler id before overwriting it. 
- SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) # SET the compiler id to nvcc. We use the value used by CMake 3.8. - SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) - - STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") - IF(INTERNAL_USE_COMPILER_LAUNCHER) - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") + set(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) + + string(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + string(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + if(INTERNAL_USE_COMPILER_LAUNCHER) + message(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") kokkos_compilation(GLOBAL) - ENDIF() -ENDIF() + endif() +endif() -IF(Kokkos_ENABLE_HIP) +if(Kokkos_ENABLE_HIP) # get HIP version - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) - 
IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) - SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) - ENDIF() + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + if(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + set(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + endif() - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c Cray - OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_CRAY TRUE) - ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c Cray + OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_CRAY TRUE) + set(KOKKOS_CXX_COMPILER_ID CrayClang) + endif() # The clang based Intel compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c "DPC++\\|icpx" - OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_INTEL TRUE) - 
SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - ENDIF() -ENDIF() - -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c "DPC++\\|icpx" + OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_INTEL TRUE) + set(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + endif() +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) # SET Cray's compiler version. 
- EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - IF (KOKKOS_CLANG_IS_CRAY) - SET(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) - ELSE() - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - ENDIF() -ENDIF() - -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + if(KOKKOS_CLANG_IS_CRAY) + set(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) + else() + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + endif() +endif() + +if(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) # SET Fujitsus compiler version which is not detected by CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +endif() # Enforce the minimum compilers supported by Kokkos. -SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. 
Required compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) 8.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) 10.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 8.2.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 19.0.5 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) 2021.1.1 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) 2023.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 11.0.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 5.2.0 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI 22.3 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC 19.29 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") - -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 8.0.0) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 10.0.0) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 8.2.0) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 19.0.5) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.1.1) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION 
VERSION_LESS 2023.0.0) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.0.0) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() - SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 5.2.0) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 22.3) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +if(CMAKE_CXX_STANDARD EQUAL 17) + set(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 8.2.0) + set(KOKKOS_INTEL_MINIMUM 19.0.5) + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 11.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.29) +else() + set(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 10.1.0) + set(KOKKOS_INTEL_MINIMUM "not supported") + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 12.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.30) +endif() + +set(KOKKOS_MESSAGE_TEXT + "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. 
Required minimum compiler versions:" +) +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") + +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + if((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) + message(FATAL_ERROR 
"${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() + set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() # Treat PGI internally as NVHPC to simplify handling both compilers. # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is # backward-compatible to pgc++. - SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 19.29) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -ENDIF() - -IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) -ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) - SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ENDIF() - -STRING(REPLACE "." 
";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) -LIST(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) -LIST(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) -LIST(LENGTH VERSION_LIST LIST_LENGTH) + set(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +endif() -# On Android, the compiler doesn't have a patch version, just a major/minor -IF(LIST_LENGTH GREATER 2) - LIST(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) -ELSE() - SET(KOKKOS_COMPILER_VERSION_PATCH 0) -ENDIF() +if(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) +elseif(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + set(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +endif() +string(REPLACE "." 
";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) +list(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) +list(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) +list(LENGTH VERSION_LIST LIST_LENGTH) + +# On Android, the compiler doesn't have a patch version, just a major/minor +if(LIST_LENGTH GREATER 2) + list(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) +else() + set(KOKKOS_COMPILER_VERSION_PATCH 0) +endif() diff --git a/packages/kokkos/cmake/kokkos_configure_trilinos.cmake b/packages/kokkos/cmake/kokkos_configure_trilinos.cmake new file mode 100644 index 0000000000000000000000000000000000000000..5aeef61e7b3220bbbd30e4cddda3b9d7290ab920 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_configure_trilinos.cmake @@ -0,0 +1,38 @@ +if(CMAKE_PROJECT_NAME STREQUAL "Trilinos") + set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Whether to build Serial backend" FORCE) + + if(NOT ${Trilinos_ENABLE_OpenMP} STREQUAL "") + set(Kokkos_ENABLE_OPENMP ${Trilinos_ENABLE_OpenMP} CACHE BOOL "Whether to build OpenMP backend" FORCE) + else() + set(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "Whether to build OpenMP backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_CUDA} STREQUAL "") + set(Kokkos_ENABLE_CUDA ${TPL_ENABLE_CUDA} CACHE BOOL "Whether to build CUDA backend" FORCE) + else() + set(Kokkos_ENABLE_CUDA OFF CACHE BOOL "Whether to build CUDA backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_HPX} STREQUAL "") + set(Kokkos_ENABLE_HPX ${TPL_ENABLE_HPX} CACHE BOOL "Whether to build HPX backend" FORCE) + else() + set(Kokkos_ENABLE_HPX OFF CACHE BOOL "Whether to build HPX backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_quadmath} STREQUAL "") + set(Kokkos_ENABLE_LIBQUADMATH ${TPL_ENABLE_quadmath} CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + else() + set(Kokkos_ENABLE_LIBQUADMATH OFF CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + endif() + + if(NOT ${TPL_ENABLE_DLlib} STREQUAL "") + set(Kokkos_ENABLE_LIBDL ${TPL_ENABLE_DLlib} CACHE BOOL "Whether to enable the 
LIBDL library" FORCE) + else() + set(Kokkos_ENABLE_LIBDL OFF CACHE BOOL "Whether to enable the LIBDL library" FORCE) + endif() + + set(Kokkos_ENABLE_COMPLEX_ALIGN OFF CACHE BOOL "Whether to align Kokkos::complex to 2*alignof(RealType)") + + # FIXME_TRILINOS We run into problems when trying to use an external GTest in Trilinos CI + set(CMAKE_DISABLE_FIND_PACKAGE_GTest ON) +endif() diff --git a/packages/kokkos/cmake/kokkos_corner_cases.cmake b/packages/kokkos/cmake/kokkos_corner_cases.cmake index ede2b4e0caf8661ece061c19eb81ae501082b73e..530e9e8fd8e0dee6a3c8eadf15022f5d4474c984 100644 --- a/packages/kokkos/cmake/kokkos_corner_cases.cmake +++ b/packages/kokkos/cmake/kokkos_corner_cases.cmake @@ -1,4 +1,8 @@ -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.2) - MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. See https://github.com/kokkos/kokkos/issues/3496") -ENDIF() - +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 11.2 +) + message( + WARNING + "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. 
See https://github.com/kokkos/kokkos/issues/3496" + ) +endif() diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake index 9a977520a3a02e37e0d1dd2c9dac38d166aa7bfb..40c2d3ea8afbc2cdae852b804b2517ea7ab4f824 100644 --- a/packages/kokkos/cmake/kokkos_enable_devices.cmake +++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake @@ -1,125 +1,132 @@ - -FUNCTION(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME}) - LIST(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) +function(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME}) + list(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) - IF (KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") - SET(KOKKOS_HAS_HOST ON PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + set(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) + if(KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") + set(KOKKOS_HAS_HOST ON PARENT_SCOPE) + endif() +endfunction() -KOKKOS_CFG_DEPENDS(DEVICES NONE) +kokkos_cfg_depends(DEVICES NONE) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(DEVICES ENABLE) - +kokkos_deprecated_list(DEVICES ENABLE) -KOKKOS_DEVICE_OPTION(THREADS OFF HOST "Whether to build C++ threads backend") +kokkos_device_option(THREADS OFF HOST "Whether to build C++ threads backend") # detect clang++ / cl / clang-cl clashes -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") 
+if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") # this specific test requires CMake >= 3.15 - IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + if("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") # use pure clang++ instead of clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC OFF) - ELSE() + set(KOKKOS_COMPILER_CLANG_MSVC OFF) + else() # it defaults to clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC ON) - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) - SET(OMP_DEFAULT ON) -ELSE() - SET(OMP_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") - -KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend") -IF (KOKKOS_ENABLE_OPENACC) - COMPILER_SPECIFIC_FLAGS( - Clang -fopenacc -fopenacc-fake-async-wait - -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version - -Wno-pass-failed - ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG - ) -ENDIF() - -KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(ClangOpenMPFlag -fopenmp=libomp) - IF(KOKKOS_CLANG_IS_CRAY) - SET(ClangOpenMPFlag -fopenmp) - ENDIF() - - COMPILER_SPECIFIC_FLAGS( - Clang ${ClangOpenMPFlag} -Wno-openmp-mapping - IntelLLVM -fiopenmp -Wno-openmp-mapping - NVHPC -mp=gpu - DEFAULT -fopenmp - ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG - ) -# Are there compilers which identify as Clang and need this library? 
-# COMPILER_SPECIFIC_LIBS( -# Clang -lopenmptarget -# ) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_DEFAULT ON) -ELSE() - SET(CUDA_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") - -IF (KOKKOS_ENABLE_CUDA) - GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") -## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros - LIST(APPEND DEVICE_SETUP_LIST Cuda) -ENDIF() + set(KOKKOS_COMPILER_CLANG_MSVC ON) + endif() +endif() + +if(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) + set(OMP_DEFAULT ON) +else() + set(OMP_DEFAULT OFF) +endif() +kokkos_device_option(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") # We want this to default to OFF for cache reasons, but if no # host space is given, then activate serial -IF (KOKKOS_HAS_TRILINOS) - #However, Trilinos always wants Serial ON - SET(SERIAL_DEFAULT ON) -ELSEIF (KOKKOS_HAS_HOST) - SET(SERIAL_DEFAULT OFF) -ELSE() - SET(SERIAL_DEFAULT ON) - IF (NOT DEFINED Kokkos_ENABLE_SERIAL) - MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") - ENDIF() -ENDIF() -KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") - -KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") - -KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") +if(KOKKOS_HAS_HOST) + set(SERIAL_DEFAULT OFF) +else() + set(SERIAL_DEFAULT ON) + if(NOT DEFINED Kokkos_ENABLE_SERIAL) + message( + STATUS + "SERIAL backend is being turned on to ensure there is at least one Host space. 
To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt" + ) + endif() +endif() +kokkos_device_option(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") + +kokkos_device_option(HPX OFF HOST "Whether to build HPX backend (experimental)") + +# Device backends have to come after host backends for header include order reasons +# Without this we can't make e.g. CudaSpace accessible by HostSpace +kokkos_device_option(OPENACC OFF DEVICE "Whether to build the OpenACC backend") +if(KOKKOS_ENABLE_OPENACC) + compiler_specific_flags( + Clang + -fopenacc + -fopenacc-fake-async-wait + -fopenacc-implicit-worker=vector + -Wno-openacc-and-cxx + -Wno-openmp-mapping + -Wno-unknown-cuda-version + -Wno-pass-failed + ) + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) +endif() + +kokkos_device_option(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") +if(KOKKOS_ENABLE_OPENMPTARGET) + set(ClangOpenMPFlag -fopenmp=libomp) + if(KOKKOS_CLANG_IS_CRAY) + set(ClangOpenMPFlag -fopenmp) + endif() + + compiler_specific_flags( + Clang + ${ClangOpenMPFlag} + -Wno-openmp-mapping + IntelLLVM + -fiopenmp + -Wno-openmp-mapping + NVHPC + -mp=gpu + DEFAULT + -fopenmp + ) + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) + # Are there compilers which identify as Clang and need this library? 
+ # COMPILER_SPECIFIC_LIBS( + # Clang -lopenmptarget + # ) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") + endif() +endif() + +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + set(CUDA_DEFAULT ON) +else() + set(CUDA_DEFAULT OFF) +endif() +kokkos_device_option(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") + +if(KOKKOS_ENABLE_CUDA) + global_set(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") + ## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + list(APPEND DEVICE_SETUP_LIST Cuda) +endif() + +kokkos_device_option(HIP OFF DEVICE "Whether to build HIP backend") ## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros -IF (KOKKOS_ENABLE_HIP) - LIST(APPEND DEVICE_SETUP_LIST HIP) -ENDIF() +if(KOKKOS_ENABLE_HIP) + list(APPEND DEVICE_SETUP_LIST HIP) +endif() -KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") +kokkos_device_option(SYCL OFF DEVICE "Whether to build SYCL backend") ## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros -IF (KOKKOS_ENABLE_SYCL) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") - ENDIF() - LIST(APPEND DEVICE_SETUP_LIST SYCL) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "SYCL backend requires C++17 or newer!") + endif() + list(APPEND DEVICE_SETUP_LIST SYCL) +endif() diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake index 89e23b019bdca0a3084ce3795bf7f3bde76baf66..a5d6fdfe4edde7480a8b13ad387f08abf0bdc8e0 100644 --- a/packages/kokkos/cmake/kokkos_enable_options.cmake +++ b/packages/kokkos/cmake/kokkos_enable_options.cmake @@ -1,191 +1,236 @@ ########################## NOTES ############################################### # List the options for configuring kokkos using CMake method of doing it. 
-# These options then get mapped onto KOKKOS_SETTINGS environment variable by -# kokkos_settings.cmake. It is separate to allow other packages to override -# these variables (e.g., TriBITS). ########################## AVAILABLE OPTIONS ################################### # Use lists for documentation, verification, and programming convenience - -FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) +function(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) + list(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) -ENDFUNCTION() + set(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) +endfunction() # Certain defaults will depend on knowing the enabled devices -KOKKOS_CFG_DEPENDS(OPTIONS DEVICES) -KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) +kokkos_cfg_depends(OPTIONS DEVICES) +kokkos_cfg_depends(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) +kokkos_deprecated_list(OPTIONS ENABLE) -KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") -KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") -KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") 
+kokkos_enable_option(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") +kokkos_enable_option(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") +kokkos_enable_option(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") # In contrast to other CUDA-dependent, options CUDA_LAMBDA is ON by default. # That is problematic when CUDA is not enabled because this not only yields a # bogus warning, but also exports the Kokkos_ENABLE_CUDA_LAMBDA variable and -# sets it to ON. This if-clause is a crutch that delays the refactoring of the -# way we declare all options until after we get rid of TriBITS. -IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSEIF (KOKKOS_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSE() - SET(CUDA_LAMBDA_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**") - -# May be used to disable our use of CudaMallocAsync. It had caused issues in -# the past when UCX was used as MPI communication layer. We expect it is -# resolved but we keep the option around a bit longer to be safe. 
-KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") -KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major release 3 is available" ) -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) -KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) -KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") -KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") -KOKKOS_ENABLE_OPTION(BENCHMARKS OFF "Whether to build the benchmarks") -KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") -STRING(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) -IF(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - KOKKOS_ENABLE_OPTION(DEBUG ON "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") -ELSE() - KOKKOS_ENABLE_OPTION(DEBUG OFF "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") -ENDIF() -UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") -KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") -KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") -KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") -KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") -KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") -KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF 
"Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") +# sets it to ON. +kokkos_enable_option( + CUDA_LAMBDA ${KOKKOS_ENABLE_CUDA} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**" +) + +# As of 09/2024, cudaMallocAsync causes issues with ICP and older version of UCX +# as MPI communication layer. +kokkos_enable_option(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +kokkos_enable_option(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") +kokkos_enable_option(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") + +kokkos_enable_option(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available") +kokkos_enable_option(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings") +kokkos_enable_option(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") + +# Disabling RDC only works properly since oneAPI 2024.1.0 +if(KOKKOS_ENABLE_SYCL AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 2024.1.0 +) + set(SYCL_RDC_DEFAULT ON) +else() + set(SYCL_RDC_DEFAULT OFF) +endif() +kokkos_enable_option( + SYCL_RELOCATABLE_DEVICE_CODE ${SYCL_RDC_DEFAULT} "Whether to enable relocatable device code (RDC) for SYCL" +) +kokkos_enable_option(TESTS OFF "Whether to build the unit tests") +kokkos_enable_option(BENCHMARKS OFF "Whether to build the benchmarks") +kokkos_enable_option(EXAMPLES OFF "Whether to build the examples") +string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) +if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + kokkos_enable_option(DEBUG ON "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") +else() + kokkos_enable_option(DEBUG OFF "Whether to activate extra debug 
features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") +endif() +unset(_UPPERCASE_CMAKE_BUILD_TYPE) +kokkos_enable_option(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") +kokkos_enable_option(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") +kokkos_enable_option(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") +kokkos_enable_option(TUNING OFF "Whether to create bindings for tuning tools") +kokkos_enable_option(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +kokkos_enable_option(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") +kokkos_enable_option( + HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF + "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time" +) +kokkos_enable_option(IMPL_HIP_MALLOC_ASYNC OFF "Whether to enable hipMallocAsync") +kokkos_enable_option(OPENACC_FORCE_HOST_AS_DEVICE OFF "Whether to force to use host as a target device for OpenACC") # This option will go away eventually, but allows fallback to old implementation when needed. 
-KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") - -KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support") -KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") -KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan") +kokkos_enable_option(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +kokkos_enable_option( + ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases" +) +kokkos_enable_option( + IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting" +) +mark_as_advanced(Kokkos_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY) +kokkos_enable_option( + IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF + "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction." 
+) +mark_as_advanced(Kokkos_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND) + +kokkos_enable_option(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") +kokkos_enable_option(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") +kokkos_enable_option( + IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan" +) mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) -IF (Trilinos_ENABLE_Kokkos) - SET(COMPLEX_ALIGN_DEFAULT OFF) -ELSE() - SET(COMPLEX_ALIGN_DEFAULT ON) -ENDIF() -KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") - -IF (KOKKOS_ENABLE_TESTS) - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) -ELSE() - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") -IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) - MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. 
Option will be ignored.") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) - SET(CUDA_CONSTEXPR_DEFAULT ON) -ELSE() - SET(CUDA_CONSTEXPR_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") - -IF (KOKKOS_ENABLE_HPX) - SET(HPX_ASYNC_DISPATCH_DEFAULT ON) -ELSE() - SET(HPX_ASYNC_DISPATCH_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") - -Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") - -FUNCTION(check_device_specific_options) - CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) - IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) - FOREACH(OPTION ${SOME_OPTIONS}) - IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() - IF(KOKKOS_ENABLE_${OPTION}) - MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. 
Option will be ignored.") - UNSET(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() - -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +kokkos_enable_option(COMPLEX_ALIGN ON "Whether to align Kokkos::complex to 2*alignof(RealType)") + +if(KOKKOS_ENABLE_TESTS) + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +else() + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +endif() +kokkos_enable_option( + HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests" +) +if(NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + message( + WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored." + ) +endif() + +if(KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) + set(CUDA_CONSTEXPR_DEFAULT ON) +else() + set(CUDA_CONSTEXPR_DEFAULT OFF) +endif() +kokkos_enable_option( + CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions" +) + +if(KOKKOS_ENABLE_HPX) + set(HPX_ASYNC_DISPATCH_DEFAULT ON) +else() + set(HPX_ASYNC_DISPATCH_DEFAULT OFF) +endif() +kokkos_enable_option(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") + +kokkos_enable_option(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") + +function(check_device_specific_options) + cmake_parse_arguments(SOME "" "DEVICE" "OPTIONS" ${ARGN}) + if(NOT KOKKOS_ENABLE_${SOME_DEVICE}) + foreach(OPTION ${SOME_OPTIONS}) + if(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + message(FATAL_ERROR "Internal logic error: option '${OPTION}' 
or device '${SOME_DEVICE}' not recognized.") + endif() + if(KOKKOS_ENABLE_${OPTION}) + message( + WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored." + ) + unset(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) + endif() + endforeach() + endif() +endfunction() + +check_device_specific_options( + DEVICE + CUDA + OPTIONS + CUDA_UVM + CUDA_RELOCATABLE_DEVICE_CODE + CUDA_LAMBDA + CUDA_CONSTEXPR + CUDA_LDG_INTRINSIC + IMPL_CUDA_MALLOC_ASYNC + IMPL_CUDA_UNIFIED_MEMORY +) +check_device_specific_options( + DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE HIP_MULTIPLE_KERNEL_INSTANTIATIONS IMPL_HIP_MALLOC_ASYNC +) +check_device_specific_options(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +check_device_specific_options(DEVICE OPENACC OPTIONS OPENACC_FORCE_HOST_AS_DEVICE) # Needed due to change from deprecated name to new header define name -IF (KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) - SET(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) -ENDIF() +if(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) + set(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) +endif() # Force consistency of KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE # and CMAKE_CUDA_SEPARABLE_COMPILATION when we are compiling # using the CMake CUDA language support. # Either one being on will turn the other one on. -IF (KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - IF (NOT CMAKE_CUDA_SEPARABLE_COMPILATION) - MESSAGE(STATUS "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. 
When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support") - SET(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - ELSE() - IF (CMAKE_CUDA_SEPARABLE_COMPILATION) - SET(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) - ENDIF() - ENDIF() -ENDIF() +if(KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + if(NOT CMAKE_CUDA_SEPARABLE_COMPILATION) + message( + STATUS + "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support" + ) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + endif() + else() + if(CMAKE_CUDA_SEPARABLE_COMPILATION) + set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) + endif() + endif() +endif() # This is known to occur with Clang 9. We would need to use nvcc as the linker # http://lists.llvm.org/pipermail/cfe-dev/2018-June/058296.html # TODO: Through great effort we can use a different linker by hacking # CMAKE_CXX_LINK_EXECUTABLE in a future release -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - MESSAGE(FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC") -ENDIF() - -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) - MESSAGE(FATAL_ERROR "Relocatable device code requires static libraries.") -ENDIF() - -IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. 
LDG intrinsics are always enabled.") - ENDIF() -ENDIF() -IF(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON") +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + message( + FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC" + ) +endif() + +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) + message(FATAL_ERROR "Relocatable device code requires static libraries.") +endif() + +if(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") + endif() +endif() +if(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON" + ) set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE) set(KOKKOS_ENABLE_CUDA_LAMBDA ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") - ENDIF() -ENDIF() - - -IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) - MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") -ENDIF() + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. 
Lambda expressions in device code always enabled.") + endif() +endif() + +if(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) + message(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") +endif() diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake index 9dab1ca00ea4aced372c28711c07161c63e58e32..38eedd8362c559c3b73e6e99057ecb96a8be0ac7 100644 --- a/packages/kokkos/cmake/kokkos_functions.cmake +++ b/packages/kokkos/cmake/kokkos_functions.cmake @@ -5,12 +5,8 @@ # Validate options are given with correct case and define an internal # upper-case version for use within -set(Kokkos_OPTIONS_NOT_TO_EXPORT - Kokkos_ENABLE_BENCHMARKS - Kokkos_ENABLE_EXAMPLES - Kokkos_ENABLE_TESTS - Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS - Kokkos_ENABLE_COMPILER_WARNINGS +set(Kokkos_OPTIONS_NOT_TO_EXPORT Kokkos_ENABLE_BENCHMARKS Kokkos_ENABLE_EXAMPLES Kokkos_ENABLE_TESTS + Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS Kokkos_ENABLE_COMPILER_WARNINGS ) # @@ -22,139 +18,122 @@ set(Kokkos_OPTIONS_NOT_TO_EXPORT # It attempts to print a helpful message about updating the options for the new CMake. # Kokkos_${SUFFIX} is the name of the option (like Kokkos_ARCH) being checked. # Kokkos_${PREFIX}_X is the name of new option to be defined from a list X,Y,Z,... -FUNCTION(kokkos_deprecated_list SUFFIX PREFIX) - SET(CAMEL_NAME Kokkos_${SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_deprecated_list SUFFIX PREFIX) + set(CAMEL_NAME Kokkos_${SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) #I don't love doing it this way but better to be safe - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - STRING(REPLACE "," ";" optlist "${${opt}}") - SET(ERROR_MSG "Given deprecated option list ${opt}. 
This must now be given as separate -D options, which assuming you spelled options correctly would be:") - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - STRING(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") - ENDFOREACH() - STRING(APPEND ERROR_MSG "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it).") - IF (KOKKOS_HAS_TRILINOS) - MESSAGE(WARNING ${ERROR_MSG}) - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - SET(${CAMEL_NAME}_${ENTRY_UC} ON CACHE BOOL "Deprecated Trilinos translation") - ENDFOREACH() - UNSET(${opt} CACHE) - ELSE() - MESSAGE(SEND_ERROR ${ERROR_MSG}) - ENDIF() - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE}) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + string(REPLACE "," ";" optlist "${${opt}}") + set(ERROR_MSG + "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:" + ) + foreach(entry ${optlist}) + string(TOUPPER ${entry} ENTRY_UC) + string(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") + endforeach() + string( + APPEND + ERROR_MSG + "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it)." 
+ ) + message(SEND_ERROR ${ERROR_MSG}) + endif() + endforeach() +endfunction() + +function(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES ${TYPE}) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) # Make sure this appears in the cache with the appropriate DOCSTRING - SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) - - IF (KOKKOS_HAS_TRILINOS) - IF (NOT CAMEL_NAME IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - TRIBITS_PKG_EXPORT_CACHE_VAR(${CAMEL_NAME}) - ENDIF() - ENDIF() + set(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. 
Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." + ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -INCLUDE (CMakeDependentOption) -FUNCTION(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES BOOL) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - - CMAKE_DEPENDENT_OPTION(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() + +include(CMakeDependentOption) +function(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES BOOL) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + + cmake_dependent_option(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL 
"${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." 
+ ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - -FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE) - LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) - IF(OPTION_INDEX EQUAL -1) - MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") - ENDIF() - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) - - LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) - LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) - SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) - MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}") - SET(${UC_NAME} ${VALUE} PARENT_SCOPE) -ENDFUNCTION() - -FUNCTION(kokkos_append_config_line LINE) - GLOBAL_APPEND(KOKKOS_TPL_EXPORTS "${LINE}") -ENDFUNCTION() - -MACRO(kokkos_export_cmake_tpl NAME) + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() + +function(kokkos_set_option CAMEL_SUFFIX VALUE) + list(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) + if(OPTION_INDEX EQUAL -1) + message(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") + endif() + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) + + list(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) + list(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) + set(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) + message(STATUS "Setting ${CAMEL_NAME}=${VALUE}") + set(${UC_NAME} ${VALUE} PARENT_SCOPE) +endfunction() + +function(kokkos_append_config_line LINE) + global_append(KOKKOS_TPL_EXPORTS "${LINE}") +endfunction() + +macro(kokkos_export_cmake_tpl NAME) cmake_parse_arguments(KOKKOS_EXTRA_ARG "REQUIRED" "" "COMPONENTS" ${ARGN}) #CMake TPLs are located with a call to find_package @@ -163,91 
+142,88 @@ MACRO(kokkos_export_cmake_tpl NAME) #If Kokkos was configured to find the TPL through a _DIR variable #make sure thar DIR variable is available to downstream packages - IF (DEFINED ${NAME}_DIR) + if(DEFINED ${NAME}_DIR) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_DIR)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_DIR ${${NAME}_DIR})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_DIR)") + kokkos_append_config_line(" SET(${NAME}_DIR ${${NAME}_DIR})") + kokkos_append_config_line("ENDIF()") + endif() - IF (DEFINED ${NAME}_ROOT) + if(DEFINED ${NAME}_ROOT) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_ROOT)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_ROOT ${${NAME}_ROOT})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - SET(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") - - IF(KOKKOS_EXTRA_ARG_REQUIRED) - STRING(APPEND KOKKOS_CONFIG_STRING " REQUIRED") - ENDIF() - IF(KOKKOS_EXTRA_ARG_COMPONENTS) - STRING(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") - ENDIF() - STRING(APPEND KOKKOS_CONFIG_STRING ")") - KOKKOS_APPEND_CONFIG_LINE(${KOKKOS_CONFIG_STRING}) -ENDMACRO() - -MACRO(kokkos_export_imported_tpl NAME) - IF (NOT KOKKOS_HAS_TRILINOS) - GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED) - IF (NOT LIB_IMPORTED) - # This is not an imported target - # This an interface library that we created - INSTALL( - TARGETS ${NAME} - EXPORT KokkosTargets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) - ELSE() - #make sure 
this also gets "exported" in the config file - KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})") - - GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE) - IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - ELSE() - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) - IF(TPL_LIBRARY) - KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") - ENDIF() - ENDIF() - - GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) - IF(TPL_INCLUDES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") - ENDIF() - - GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) - IF(TPL_COMPILE_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") - ENDIF() - - SET(TPL_LINK_OPTIONS) - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - IF(TPL_LINK_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") - ENDIF() - - GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) - IF(TPL_LINK_LIBRARIES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") - ENDIF() - KOKKOS_APPEND_CONFIG_LINE(")") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - ENDIF() -ENDMACRO() - + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_ROOT)") + kokkos_append_config_line(" SET(${NAME}_ROOT ${${NAME}_ROOT})") + kokkos_append_config_line("ENDIF()") + endif() + set(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") + + if(KOKKOS_EXTRA_ARG_REQUIRED) + string(APPEND KOKKOS_CONFIG_STRING " REQUIRED") + endif() + if(KOKKOS_EXTRA_ARG_COMPONENTS) + string(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") + endif() + 
string(APPEND KOKKOS_CONFIG_STRING ")") + kokkos_append_config_line(${KOKKOS_CONFIG_STRING}) +endmacro() + +macro(kokkos_export_imported_tpl NAME) + get_target_property(LIB_IMPORTED ${NAME} IMPORTED) + if(NOT LIB_IMPORTED) + # This is not an imported target + # This an interface library that we created + install( + TARGETS ${NAME} + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + else() + #make sure this also gets "exported" in the config file + kokkos_append_config_line("IF(NOT TARGET ${NAME})") + + get_target_property(LIB_TYPE ${NAME} TYPE) + if(${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") + kokkos_append_config_line("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + else() + kokkos_append_config_line("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + get_target_property(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) + if(TPL_LIBRARY) + kokkos_append_config_line("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") + endif() + endif() + + get_target_property(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) + if(TPL_INCLUDES) + kokkos_append_config_line("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") + endif() + + get_target_property(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) + if(TPL_COMPILE_OPTIONS) + kokkos_append_config_line("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") + endif() + + set(TPL_LINK_OPTIONS) + get_target_property(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) + if(TPL_LINK_OPTIONS) + kokkos_append_config_line("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") + endif() + + get_target_property(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) + if(TPL_LINK_LIBRARIES) + kokkos_append_config_line("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") + endif() + kokkos_append_config_line(")") + 
kokkos_append_config_line("ENDIF()") + endif() +endmacro() # # @MACRO: KOKKOS_IMPORT_TPL() @@ -271,57 +247,43 @@ ENDMACRO() # # If specified, this TPL will build an INTERFACE library rather than an # IMPORTED target -IF (KOKKOS_HAS_TRILINOS) -MACRO(kokkos_import_tpl NAME) - #do nothing -ENDMACRO() -ELSE() -MACRO(kokkos_import_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT;INTERFACE" - "" - "" - ${ARGN}) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - - IF (KOKKOS_ENABLE_${NAME}) +macro(kokkos_import_tpl NAME) + cmake_parse_arguments(TPL "NO_EXPORT;INTERFACE" "" "" ${ARGN}) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + + if(KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find - FIND_PACKAGE(TPL${NAME} REQUIRED MODULE) - IF(NOT TARGET ${TPL_IMPORTED_NAME}) - MESSAGE(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") - ENDIF() - IF(NOT TPL_NO_EXPORT) - GET_TARGET_PROPERTY(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) - IF (NOT TPL_ORIGINAL_NAME) - SET(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) - ENDIF() - KOKKOS_EXPORT_IMPORTED_TPL(${TPL_ORIGINAL_NAME}) - ENDIF() - LIST(APPEND KOKKOS_ENABLED_TPLS ${NAME}) - ENDIF() -ENDMACRO(kokkos_import_tpl) -ENDIF() - -MACRO(kokkos_import_cmake_tpl MODULE_NAME) + find_package(TPL${NAME} REQUIRED MODULE) + if(NOT TARGET ${TPL_IMPORTED_NAME}) + message(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") + endif() + if(NOT TPL_NO_EXPORT) + get_target_property(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) + if(NOT TPL_ORIGINAL_NAME) + set(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) + endif() + kokkos_export_imported_tpl(${TPL_ORIGINAL_NAME}) + endif() + list(APPEND KOKKOS_ENABLED_TPLS ${NAME}) + endif() +endmacro(kokkos_import_tpl) + 
+macro(kokkos_import_cmake_tpl MODULE_NAME) kokkos_import_tpl(${MODULE_NAME} ${ARGN} NO_EXPORT) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT" - "OPTION_NAME" - "" - ${ARGN}) + cmake_parse_arguments(TPL "NO_EXPORT" "OPTION_NAME" "" ${ARGN}) - IF (NOT TPL_OPTION_NAME) - SET(TPL_OPTION_NAME ${MODULE_NAME}) - ENDIF() + if(NOT TPL_OPTION_NAME) + set(TPL_OPTION_NAME ${MODULE_NAME}) + endif() - IF (NOT TPL_NO_EXPORT) - KOKKOS_EXPORT_CMAKE_TPL(${MODULE_NAME}) - ENDIF() -ENDMACRO() + if(NOT TPL_NO_EXPORT) + kokkos_export_cmake_tpl(${MODULE_NAME}) + endif() +endmacro() # # @MACRO: KOKKOS_CREATE_IMPORTED_TPL() @@ -368,68 +330,57 @@ ENDMACRO() # # If specified, this gives a list of linker flags that must be used # for using this library. -MACRO(kokkos_create_imported_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE" - "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" - ${ARGN}) - - - IF (KOKKOS_HAS_TRILINOS) - #TODO: we need to set a bunch of cache variables here - ELSEIF (TPL_INTERFACE) - ADD_LIBRARY(${NAME} INTERFACE) +macro(kokkos_create_imported_tpl NAME) + cmake_parse_arguments( + TPL "INTERFACE" "LIBRARY" "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN} + ) + + if(TPL_INTERFACE) + add_library(${NAME} INTERFACE) #Give this an importy-looking name - ADD_LIBRARY(Kokkos::${NAME} ALIAS ${NAME}) - IF (TPL_LIBRARY) - MESSAGE(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") - ENDIF() + add_library(Kokkos::${NAME} ALIAS ${NAME}) + if(TPL_LIBRARY) + message(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) - ENDIF() - IF(TPL_INCLUDES) - TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE 
${TPL_COMPILE_DEFINITIONS}) - ENDIF() - IF(TPL_COMPILE_OPTIONS) - TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) - ENDIF() - IF(TPL_LINK_OPTIONS) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) - ENDIF() - ELSE() - ADD_LIBRARY(${NAME} UNKNOWN IMPORTED) - IF(TPL_LIBRARY) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - IMPORTED_LOCATION ${TPL_LIBRARY}) - ENDIF() + if(TPL_LINK_LIBRARIES) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) + endif() + if(TPL_INCLUDES) + target_include_directories(${NAME} INTERFACE ${TPL_INCLUDES}) + endif() + if(TPL_COMPILE_DEFINITIONS) + target_compile_definitions(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + endif() + if(TPL_COMPILE_OPTIONS) + target_compile_options(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) + endif() + if(TPL_LINK_OPTIONS) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) + endif() + else() + add_library(${NAME} UNKNOWN IMPORTED) + if(TPL_LIBRARY) + set_target_properties(${NAME} PROPERTIES IMPORTED_LOCATION ${TPL_LIBRARY}) + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") - ENDIF() - IF(TPL_INCLUDES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") - ENDIF() - IF(TPL_COMPILE_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") - ENDIF() - IF(TPL_LINK_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") - ENDIF() - ENDIF() -ENDMACRO() + if(TPL_LINK_LIBRARIES) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") + endif() + if(TPL_INCLUDES) + set_target_properties(${NAME} PROPERTIES 
INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") + endif() + if(TPL_COMPILE_DEFINITIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") + endif() + if(TPL_COMPILE_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") + endif() + if(TPL_LINK_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") + endif() + endif() +endmacro() # # @MACRO: KOKKOS_FIND_HEADER @@ -479,37 +430,32 @@ ENDMACRO() # # Custom paths to search for the header # -MACRO(kokkos_find_header VAR_NAME HEADER TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS" - ${ARGN}) - - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) - - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_PATH(${VAR_NAME} ${HEADER} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} +macro(kokkos_find_header VAR_NAME HEADER TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS" ${ARGN}) + + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) + + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_path( + ${VAR_NAME} ${HEADER} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} PATH_SUFFIXES include - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_PATH(${VAR_NAME} ${HEADER}) - ENDIF() + find_path(${VAR_NAME} ${HEADER}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_LIBRARY @@ -565,42 +511,36 @@ ENDMACRO() # 
Suffixes appended to PATHS when attempting to locate # the library. Defaults to {lib, lib64}. # -MACRO(kokkos_find_library VAR_NAME LIB TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS;SUFFIXES" - ${ARGN}) - - IF(NOT TPL_SUFFIXES) - SET(TPL_SUFFIXES lib lib64) - ENDIF() - - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) - - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_LIBRARY(${VAR_NAME} ${LIB} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} - PATH_SUFFIXES - ${TPL_SUFFIXES} - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() - - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) +macro(kokkos_find_library VAR_NAME LIB TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS;SUFFIXES" ${ARGN}) + + if(NOT TPL_SUFFIXES) + set(TPL_SUFFIXES lib lib64) + endif() + + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) + + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_library( + ${VAR_NAME} ${LIB} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} + PATH_SUFFIXES ${TPL_SUFFIXES} + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() + + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_LIBRARY(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) - ENDIF() + find_library(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_IMPORTED @@ -683,106 +623,127 @@ ENDMACRO() # If specified, this gives a list of paths to search for the headers # If not given, <NAME>_ROOT/include and <NAME>_ROOT/include will be searched. 
# -MACRO(kokkos_find_imported NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" - "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" - "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" - ${ARGN}) - - IF(NOT TPL_MODULE_NAME) - SET(TPL_MODULE_NAME TPL${NAME}) - ENDIF() - - IF (TPL_ALLOW_SYSTEM_PATH_FALLBACK) - SET(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) - ELSE() - SET(ALLOW_PATH_FALLBACK_OPT) - ENDIF() - - IF (NOT TPL_IMPORTED_NAME) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - ENDIF() - - IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib lib64) - ENDIF() - - SET(${NAME}_INCLUDE_DIRS) - IF (TPL_HEADER) - KOKKOS_FIND_HEADER(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - ENDIF() - - FOREACH(HEADER ${TPL_HEADERS}) - KOKKOS_FIND_HEADER(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - IF(HEADER_FIND_TEMP) - LIST(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) - ENDIF() - ENDFOREACH() - - SET(${NAME}_LIBRARY) - IF(TPL_LIBRARY) - KOKKOS_FIND_LIBRARY(${NAME}_LIBRARY ${TPL_LIBRARY} ${NAME} +macro(kokkos_find_imported NAME) + cmake_parse_arguments( + TPL "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" + "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" ${ARGN} + ) + + if(NOT TPL_MODULE_NAME) + set(TPL_MODULE_NAME TPL${NAME}) + endif() + + if(TPL_ALLOW_SYSTEM_PATH_FALLBACK) + set(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) + else() + set(ALLOW_PATH_FALLBACK_OPT) + endif() + + if(NOT TPL_IMPORTED_NAME) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + endif() + + if(NOT TPL_LIBRARY_SUFFIXES) + set(TPL_LIBRARY_SUFFIXES lib) + if(KOKKOS_IMPL_32BIT) + list(APPEND TPL_LIBRARY_SUFFIXES lib32) + else() + list(APPEND TPL_LIBRARY_SUFFIXES lib64) + 
endif() + endif() + + set(${NAME}_INCLUDE_DIRS) + if(TPL_HEADER) + kokkos_find_header(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + endif() + + foreach(HEADER ${TPL_HEADERS}) + kokkos_find_header(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + if(HEADER_FIND_TEMP) + list(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) + endif() + endforeach() + + set(${NAME}_LIBRARY) + if(TPL_LIBRARY) + kokkos_find_library( + ${NAME}_LIBRARY + ${TPL_LIBRARY} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - ENDIF() - - SET(${NAME}_FOUND_LIBRARIES) - FOREACH(LIB ${TPL_LIBRARIES}) - KOKKOS_FIND_LIBRARY(${LIB}_LOCATION ${LIB} ${NAME} + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + endif() + + set(${NAME}_FOUND_LIBRARIES) + foreach(LIB ${TPL_LIBRARIES}) + kokkos_find_library( + ${LIB}_LOCATION + ${LIB} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - IF(${LIB}_LOCATION) - LIST(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - ELSE() - SET(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - BREAK() - ENDIF() - ENDFOREACH() - - INCLUDE(FindPackageHandleStandardArgs) + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + if(${LIB}_LOCATION) + list(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + else() + set(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + break() + endif() + endforeach() + + include(FindPackageHandleStandardArgs) #Collect all the variables we need to be valid for #find_package to have succeeded - SET(TPL_VARS_NEEDED) - IF (TPL_LIBRARY) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) - ENDIF() - IF(TPL_HEADER) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) - ENDIF() - IF(TPL_LIBRARIES) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) - ENDIF() - FIND_PACKAGE_HANDLE_STANDARD_ARGS(${TPL_MODULE_NAME} REQUIRED_VARS 
${TPL_VARS_NEEDED}) - - MARK_AS_ADVANCED(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) + set(TPL_VARS_NEEDED) + if(TPL_LIBRARY) + list(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) + endif() + if(TPL_HEADER) + list(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) + endif() + if(TPL_LIBRARIES) + list(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) + endif() + find_package_handle_standard_args(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) + + mark_as_advanced(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) #this is so much fun on a Cray system #/usr/include should never be added as a -isystem include #this freaks out the compiler include search order - IF (KOKKOS_IS_CRAYPE) - LIST(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") - ENDIF() - - IF (${TPL_MODULE_NAME}_FOUND) - SET(IMPORT_TYPE) - IF (TPL_INTERFACE) - SET(IMPORT_TYPE "INTERFACE") - SET(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) - ENDIF() - KOKKOS_CREATE_IMPORTED_TPL(${TPL_IMPORTED_NAME} + if(KOKKOS_IS_CRAYPE) + list(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") + endif() + + if(${TPL_MODULE_NAME}_FOUND) + set(IMPORT_TYPE) + if(TPL_INTERFACE) + set(IMPORT_TYPE "INTERFACE") + set(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) + endif() + kokkos_create_imported_tpl( + ${TPL_IMPORTED_NAME} ${IMPORT_TYPE} - INCLUDES "${${NAME}_INCLUDE_DIRS}" - LIBRARY "${${NAME}_LIBRARY}" - LINK_LIBRARIES "${${NAME}_FOUND_LIBRARIES}") - ENDIF() -ENDMACRO(kokkos_find_imported) + INCLUDES + "${${NAME}_INCLUDE_DIRS}" + LIBRARY + "${${NAME}_LIBRARY}" + LINK_LIBRARIES + "${${NAME}_FOUND_LIBRARIES}" + ) + endif() +endmacro(kokkos_find_imported) # # @MACRO: KOKKOS_LINK_TPL() @@ -812,109 +773,114 @@ ENDMACRO(kokkos_find_imported) # If specified, this gives the exact name of the target to link against # target_link_libraries(<TARGET> <IMPORTED_NAME>) # -FUNCTION(kokkos_link_tpl TARGET) - CMAKE_PARSE_ARGUMENTS(TPL - "PUBLIC;PRIVATE;INTERFACE" - "IMPORTED_NAME" - "" - ${ARGN}) +function(kokkos_link_tpl TARGET) + 
cmake_parse_arguments(TPL "PUBLIC;PRIVATE;INTERFACE" "IMPORTED_NAME" "" ${ARGN}) #the name of the TPL - SET(TPL ${TPL_UNPARSED_ARGUMENTS}) - IF (KOKKOS_HAS_TRILINOS) - #Do nothing, they will have already been linked - ELSE() - IF (NOT TPL_IMPORTED_NAME) - SET(TPL_IMPORTED_NAME Kokkos::${TPL}) - ENDIF() - IF (KOKKOS_ENABLE_${TPL}) - IF (TPL_PUBLIC) - TARGET_LINK_LIBRARIES(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_PRIVATE) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_INTERFACE) - TARGET_LINK_LIBRARIES(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) - ELSE() - TARGET_LINK_LIBRARIES(${TARGET} ${TPL_IMPORTED_NAME}) - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() - -FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA NVHPC DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu MSVC) - CMAKE_PARSE_ARGUMENTS( - PARSE - "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" - "COMPILER_ID" - "${COMPILERS}" - ${ARGN}) - IF(PARSE_UNPARSED_ARGUMENTS) - MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options") - ENDIF() - - IF(PARSE_COMPILER_ID) - SET(COMPILER ${${PARSE_COMPILER_ID}}) - ELSE() - SET(COMPILER ${KOKKOS_CXX_COMPILER_ID}) - ENDIF() - - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) - FOREACH(COMP ${COMPILERS}) - IF (COMPILER STREQUAL "${COMP}") - IF (PARSE_${COMPILER}) - IF ("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") - SET(COMPILER_SPECIFIC_FLAGS_TMP "") - ELSE() - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() - - IF (PARSE_COMPILE_OPTIONS) + set(TPL ${TPL_UNPARSED_ARGUMENTS}) + if(NOT TPL_IMPORTED_NAME) + set(TPL_IMPORTED_NAME Kokkos::${TPL}) + endif() + if(KOKKOS_ENABLE_${TPL}) + if(TPL_PUBLIC) + target_link_libraries(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) + elseif(TPL_PRIVATE) + target_link_libraries(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) + elseif(TPL_INTERFACE) + 
target_link_libraries(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) + else() + target_link_libraries(${TARGET} ${TPL_IMPORTED_NAME}) + endif() + endif() +endfunction() + +function(COMPILER_SPECIFIC_OPTIONS_HELPER) + set(COMPILERS + NVIDIA + NVHPC + DEFAULT + Cray + Intel + Clang + AppleClang + IntelLLVM + GNU + HIPCC + Fujitsu + MSVC + CrayClang + ) + cmake_parse_arguments( + PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" "COMPILER_ID" "${COMPILERS}" ${ARGN} + ) + if(PARSE_UNPARSED_ARGUMENTS) + message( + SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options" + ) + endif() + + if(PARSE_COMPILER_ID) + set(COMPILER ${${PARSE_COMPILER_ID}}) + else() + set(COMPILER ${KOKKOS_CXX_COMPILER_ID}) + endif() + + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) + foreach(COMP ${COMPILERS}) + if(COMPILER STREQUAL "${COMP}") + if(PARSE_${COMPILER}) + if("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") + set(COMPILER_SPECIFIC_FLAGS_TMP "") + else() + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) + endif() + endif() + endif() + endforeach() + + if(PARSE_COMPILE_OPTIONS) # The funky logic here is for future handling of argument deduplication # If we naively pass multiple -Xcompiler flags to target_compile_options # -Xcompiler will get deduplicated and break the build - IF ("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) - LIST(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") - GLOBAL_APPEND(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ELSE() - GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - ENDIF() - - IF (PARSE_LINK_OPTIONS) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - - IF (PARSE_COMPILE_DEFINITIONS) - GLOBAL_APPEND(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - - IF (PARSE_LINK_LIBRARIES) - GLOBAL_APPEND(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() 
-ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - -FUNCTION(COMPILER_SPECIFIC_FLAGS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_FLAGS) - -FUNCTION(COMPILER_SPECIFIC_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS) - -FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - -FUNCTION(COMPILER_SPECIFIC_DEFS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS) -ENDFUNCTION(COMPILER_SPECIFIC_DEFS) - -FUNCTION(COMPILER_SPECIFIC_LIBS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) -ENDFUNCTION(COMPILER_SPECIFIC_LIBS) + if("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) + list(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") + global_append(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + else() + global_append(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + endif() + + if(PARSE_LINK_OPTIONS) + global_append(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + + if(PARSE_COMPILE_DEFINITIONS) + global_append(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + + if(PARSE_LINK_LIBRARIES) + global_append(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() +endfunction(COMPILER_SPECIFIC_OPTIONS_HELPER) + +function(COMPILER_SPECIFIC_FLAGS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_FLAGS) + +function(COMPILER_SPECIFIC_OPTIONS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS) +endfunction(COMPILER_SPECIFIC_OPTIONS) + +function(COMPILER_SPECIFIC_LINK_OPTIONS) + compiler_specific_options_helper(${ARGN} LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_LINK_OPTIONS) + +function(COMPILER_SPECIFIC_DEFS) + compiler_specific_options_helper(${ARGN} COMPILE_DEFINITIONS) +endfunction(COMPILER_SPECIFIC_DEFS) + 
+function(COMPILER_SPECIFIC_LIBS) + compiler_specific_options_helper(${ARGN} LINK_LIBRARIES) +endfunction(COMPILER_SPECIFIC_LIBS) # Given a list of the form # key1;value1;key2;value2,... # Create a list of all keys in a variable named ${KEY_LIST_NAME} @@ -922,41 +888,42 @@ ENDFUNCTION(COMPILER_SPECIFIC_LIBS) # kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2) # would produce a list variable ALL_ARCHES=key1;key2 # and individual variables ARCHkey1=value1 and ARCHkey2=value2 -MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) - SET(PARSE_KEY ON) - SET(${KEY_LIST_NAME}) - FOREACH(ENTRY ${ARGN}) - IF(PARSE_KEY) - SET(CURRENT_KEY ${ENTRY}) - SET(PARSE_KEY OFF) - LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) - ELSE() - SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) - SET(PARSE_KEY ON) - ENDIF() - ENDFOREACH() -ENDMACRO() - -FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) - KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) - FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST}) - SET(OPTION_NAME Kokkos_${OPTION_SUFFIX}) - SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) - IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off - MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. 
${OPT_MESSAGE}") - ENDIF() - ENDFOREACH() -ENDFUNCTION() +macro(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) + set(PARSE_KEY ON) + set(${KEY_LIST_NAME}) + foreach(ENTRY ${ARGN}) + if(PARSE_KEY) + set(CURRENT_KEY ${ENTRY}) + set(PARSE_KEY OFF) + list(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) + else() + set(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) + set(PARSE_KEY ON) + endif() + endforeach() +endmacro() + +function(KOKKOS_CHECK_DEPRECATED_OPTIONS) + kokkos_key_value_map(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) + foreach(OPTION_SUFFIX ${DEPRECATED_LIST}) + set(OPTION_NAME Kokkos_${OPTION_SUFFIX}) + set(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) + if(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off + message(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPT_MESSAGE}") + endif() + endforeach() +endfunction() # this function checks whether the current CXX compiler supports building CUDA -FUNCTION(kokkos_cxx_compiler_cuda_test _VAR) - # don't run this test every time - IF(DEFINED ${_VAR}) - RETURN() - ENDIF() - - FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp -" +function(kokkos_cxx_compiler_cuda_test _VAR) + # don't run this test every time + if(DEFINED ${_VAR}) + return() + endif() + + file( + WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp + " #include <cuda.h> #include <cstdlib> @@ -980,14 +947,13 @@ int main() cudaDeviceSynchronize(); return EXIT_SUCCESS; } -") +" + ) - TRY_COMPILE(_RET - ${PROJECT_BINARY_DIR}/compile_tests - SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) + try_compile(_RET ${PROJECT_BINARY_DIR}/compile_tests SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) - SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") -ENDFUNCTION() + set(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") +endfunction() # this function is provided to easily select which files use nvcc_wrapper: # @@ 
-1000,58 +966,77 @@ ENDFUNCTION() # NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. # This version explicitly uses nvcc_wrapper. # -FUNCTION(kokkos_compilation) - # check whether the compiler already supports building CUDA - KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) - # if CUDA compile test has already been performed, just return - IF(Kokkos_CXX_COMPILER_COMPILES_CUDA) - RETURN() - ENDIF() - - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) - - # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") - ENDIF() - - # find nvcc_wrapper - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") - ENDIF() - - IF(COMP_GLOBAL) - # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ELSE() - FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) - # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) 
- IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) - LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) - UNSET(COMP_${_TYPE}) - ENDIF() - # set the properties if defined - IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() +function(kokkos_compilation) + # check whether the compiler already supports building CUDA + kokkos_cxx_compiler_cuda_test(Kokkos_CXX_COMPILER_COMPILES_CUDA) + # if CUDA compile test has already been performed, just return + if(Kokkos_CXX_COMPILER_COMPILES_CUDA) + return() + endif() + + cmake_parse_arguments(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + + # find kokkos_launch_compiler + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR + "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'" + ) + endif() + + # find nvcc_wrapper + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) + + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. 
Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'" + ) + endif() + + if(COMP_GLOBAL) + # if global, don't bother setting others + set_property( + GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + else() + foreach(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) + if("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + list(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + unset(COMP_${_TYPE}) + endif() + # set the properties if defined + if(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + endif() + endforeach() + endif() +endfunction() ## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names ## and create output config header file...used for ## creating dynamic include files based on enabled backends @@ -1061,14 +1046,15 @@ ENDFUNCTION() ## HEADER_GUARD TEXT used with include header guard ## HEADER_PREFIX prefix used with include (i.e. 
fwd, decl, setup) ## DATA_LIST list of backends to include in generated file -FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) - SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") - CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) - FOREACH( BACKEND_NAME ${DATA_LIST} ) - SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> -\@INCLUDE_NEXT_FILE\@") - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) - ENDFOREACH() - SET(INCLUDE_NEXT_FILE "" ) - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) -ENDFUNCTION() +function(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + set(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + configure_file(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + foreach(BACKEND_NAME ${DATA_LIST}) + set(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> +\@INCLUDE_NEXT_FILE\@" + ) + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + endforeach() + set(INCLUDE_NEXT_FILE "") + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +endfunction() diff --git a/packages/kokkos/cmake/kokkos_install.cmake b/packages/kokkos/cmake/kokkos_install.cmake index f818dfa24485b7e22f46d364a15fe07ede71cb71..3ae7570ffea5fcb1c9b1e76e33162c4dd88d39ab 100644 --- a/packages/kokkos/cmake/kokkos_install.cmake +++ b/packages/kokkos/cmake/kokkos_install.cmake @@ -1,57 +1,51 @@ -INCLUDE(CMakePackageConfigHelpers) -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - INCLUDE(GNUInstallDirs) +include(CMakePackageConfigHelpers) +if(NOT Kokkos_INSTALL_TESTING) + include(GNUInstallDirs) #Set all the variables needed for KokkosConfig.cmake - GET_PROPERTY(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - 
SET(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) + get_property(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + set(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) - INCLUDE(CMakePackageConfigHelpers) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfig.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + include(CMakePackageConfigHelpers) + configure_package_config_file( + cmake/KokkosConfig.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfigCommon.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + configure_package_config_file( + cmake/KokkosConfigCommon.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - WRITE_BASIC_PACKAGE_VERSION_FILE("${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) # Install the KokkosConfig*.cmake files - install(FILES - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) + install(FILES "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos + ) install(EXPORT KokkosTargets NAMESPACE Kokkos:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${Kokkos_BINARY_DIR}/KokkosTargets.cmake) # Required to be a TriBITS-compliant external package file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - file(COPY 
${Kokkos_BINARY_DIR}/KokkosConfig.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake) -ELSE() - CONFIGURE_FILE(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake KOKKOS_CONFIG_COMMON) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_CONFIG_COMMON}") - CONFIGURE_FILE(cmake/KokkosTrilinosConfig.cmake.in ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake KOKKOS_TRILINOS_CONFIG) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_TRILINOS_CONFIG}") - - WRITE_BASIC_PACKAGE_VERSION_FILE("${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake + ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos + ) + file(WRITE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake + "include(${Kokkos_BINARY_DIR}/KokkosTargets.cmake)" + ) +else() + configure_file(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) + + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/${PACKAGE_NAME}") -ENDIF() - -INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) + DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/Kokkos" + ) 
+endif() +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) diff --git a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake index d4eca651d423fd7567b25ac8fdf8af6985dee050..0d31e6d131f7f9e39dfae5856d632b154321f163 100644 --- a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,19 +1,28 @@ # From CMake 3.10 documentation #This can run at any time -KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17") +kokkos_option( + CXX_STANDARD + "" + STRING + "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17" +) # Set CXX standard flags -SET(KOKKOS_ENABLE_CXX17 OFF) -SET(KOKKOS_ENABLE_CXX20 OFF) -SET(KOKKOS_ENABLE_CXX23 OFF) -IF (KOKKOS_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") -ENDIF() +set(KOKKOS_ENABLE_CXX17 OFF) +set(KOKKOS_ENABLE_CXX20 OFF) +set(KOKKOS_ENABLE_CXX23 OFF) +set(KOKKOS_ENABLE_CXX26 OFF) +if(KOKKOS_CXX_STANDARD) + message( + FATAL_ERROR + "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead" + ) +endif() -IF (NOT CMAKE_CXX_STANDARD) - SET(KOKKOS_CXX_STANDARD "17") -ELSE() - SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -ENDIF() -MESSAGE(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") +if(NOT CMAKE_CXX_STANDARD) + set(KOKKOS_CXX_STANDARD "17") +else() + set(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) +endif() +message(STATUS 
"Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake index 7ad49fdd2d9df05a2a5bbdf62eda5d5a38a622dd..a84e714064df954613989dee5ae9697b885d56e9 100644 --- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -1,97 +1,112 @@ -KOKKOS_CFG_DEPENDS(CXX_STD COMPILER_ID) +kokkos_cfg_depends(CXX_STD COMPILER_ID) -FUNCTION(kokkos_set_cxx_standard_feature standard) - SET(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) - SET(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) - SET(FEATURE_NAME cxx_std_${standard}) +function(kokkos_set_cxx_standard_feature standard) + set(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) + set(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) + set(FEATURE_NAME cxx_std_${standard}) #CMake's way of telling us that the standard (or extension) #flags are supported is the extension/standard variables - IF (NOT DEFINED CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSEIF(CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - MESSAGE(FATAL_ERROR "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. 
Must set CMAKE_CXX_EXTENSIONS=OFF to continue") - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSE() - #For trilinos, we need to make sure downstream projects - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ENDIF() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + global_set(KOKKOS_USE_CXX_EXTENSIONS OFF) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + elseif(CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + message( + FATAL_ERROR + "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue" + ) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + endif() - IF (KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) - MESSAGE(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) - MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") - IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - IF(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) - SET(SUPPORTED_NVCC_FLAGS "-std=c++17") - ELSE() - SET(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") - ENDIF() - IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) - MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. 
Using a more recent host compiler or a more recent CMake version might help.") - ENDIF() - ENDIF() - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + if(KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) + message(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) + message(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU + OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang) + ) + if(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) + set(SUPPORTED_NVCC_FLAGS "-std=c++17") + else() + set(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") + endif() + if(NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) + message( + FATAL_ERROR + "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help." 
+ ) + endif() + endif() + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") #MSVC doesn't need a command line flag, that doesn't mean it has no support - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSE() + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu") + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + else() #nope, we can't do anything here - MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ENDIF() + message( + WARNING + "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. 
Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command." + ) + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + endif() - IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) - IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) - MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") - ENDIF() - ENDIF() -ENDFUNCTION() + if((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + if(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message( + FATAL_ERROR + "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported" + ) + endif() + endif() +endfunction() -IF(KOKKOS_CXX_STANDARD STREQUAL "17") +if(KOKKOS_CXX_STANDARD STREQUAL "17") kokkos_set_cxx_standard_feature(17) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") - SET(KOKKOS_ENABLE_CXX17 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") + set(KOKKOS_ENABLE_CXX17 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "20") kokkos_set_cxx_standard_feature(20) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") - SET(KOKKOS_ENABLE_CXX20 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") + set(KOKKOS_ENABLE_CXX20 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") - SET(KOKKOS_ENABLE_CXX23 ON) -ELSE() - MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") -ENDIF() + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") + set(KOKKOS_ENABLE_CXX23 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "26") + kokkos_set_cxx_standard_feature(26) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + set(KOKKOS_ENABLE_CXX26 ON) +else() + message(FATAL_ERROR 
"Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") +endif() # Enforce that we can compile a simple C++17 program -TRY_COMPILE(CAN_COMPILE_CPP17 - ${KOKKOS_TOP_BUILD_DIR}/corner_cases - ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp - OUTPUT_VARIABLE ERROR_MESSAGE - CXX_STANDARD 17 +try_compile( + CAN_COMPILE_CPP17 ${KOKKOS_TOP_BUILD_DIR}/corner_cases ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp + OUTPUT_VARIABLE ERROR_MESSAGE CXX_STANDARD 17 ) -if (NOT CAN_COMPILE_CPP17) - UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") -ENDIF() -UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - +if(NOT CAN_COMPILE_CPP17) + unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this + message( + FATAL_ERROR + "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}" + ) +endif() +unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's @@ -101,70 +116,70 @@ UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # that we can only use host compilers for CUDA builds that use those flags. # It also means that extensions (gnu++17) can't be turned on for CUDA builds. 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") + endif() +endif() -IF(KOKKOS_ENABLE_CUDA) +if(KOKKOS_ENABLE_CUDA) # ENFORCE that the compiler can compile CUDA code. - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) - MESSAGE(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") - ENDIF() - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() - ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND NOT (Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. To allow nvc++ as Cuda compiler, Kokkos_ENABLE_IMPL_NVHPC_AS_DEVICE_COMPILER=ON must be set!") - ELSE() - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. 
The compiler must be nvcc_wrapper or Clang or NVC++ or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") - ENDIF() - ENDIF() -ENDIF() + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) + message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") + endif() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message( + FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF" + ) + endif() + elseif(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + message( + FATAL_ERROR + "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}" + ) + endif() +endif() -IF (NOT KOKKOS_CXX_STANDARD_FEATURE) +if(NOT KOKKOS_CXX_STANDARD_FEATURE) #we need to pick the C++ flags ourselves - UNSET(CMAKE_CXX_STANDARD) - UNSET(CMAKE_CXX_STANDARD CACHE) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) + unset(CMAKE_CXX_STANDARD) + unset(CMAKE_CXX_STANDARD CACHE) + if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) + include(${KOKKOS_SRC_PATH}/cmake/cray.cmake) kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + include(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + include(${KOKKOS_SRC_PATH}/cmake/intel.cmake) kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") 
AND WIN32)) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) + include(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) + else() + include(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) kokkos_set_gnu_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ENDIF() + endif() #check that the compiler accepts the C++ standard flag - INCLUDE(CheckCXXCompilerFlag) - IF (DEFINED CXX_STD_FLAGS_ACCEPTED) - UNSET(CXX_STD_FLAGS_ACCEPTED CACHE) - ENDIF() - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) - IF (NOT CXX_STD_FLAGS_ACCEPTED) - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) - IF (NOT CXX_INT_STD_FLAGS_ACCEPTED) - MESSAGE(FATAL_ERROR "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}") - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) - ENDIF() - MESSAGE(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") -ENDIF() - - - - + include(CheckCXXCompilerFlag) + if(DEFINED CXX_STD_FLAGS_ACCEPTED) + unset(CXX_STD_FLAGS_ACCEPTED CACHE) + endif() + check_cxx_compiler_flag("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) + if(NOT CXX_STD_FLAGS_ACCEPTED) + check_cxx_compiler_flag("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) + if(NOT CXX_INT_STD_FLAGS_ACCEPTED) + message( + FATAL_ERROR + "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. 
You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}" + ) + endif() + set(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) + endif() + message(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") +endif() diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake index f124596a84e012a5dca9c94798c05b573d8bf032..f43aff4d1f083920f8a9bf8885cb2e7eac09385d 100644 --- a/packages/kokkos/cmake/kokkos_tpls.cmake +++ b/packages/kokkos/cmake/kokkos_tpls.cmake @@ -1,125 +1,120 @@ -KOKKOS_CFG_DEPENDS(TPLS OPTIONS) -KOKKOS_CFG_DEPENDS(TPLS DEVICES) -KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) +kokkos_cfg_depends(TPLS OPTIONS) +kokkos_cfg_depends(TPLS DEVICES) +kokkos_cfg_depends(TPLS COMPILER_ID) -FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) - CMAKE_PARSE_ARGUMENTS(PARSED - "" - "TRIBITS" - "" - ${ARGN}) +function(KOKKOS_TPL_OPTION PKG DEFAULT) + cmake_parse_arguments(PARSED "" "TRIBITS" "" ${ARGN}) - IF (PARSED_TRIBITS) + if(PARSED_TRIBITS) #this is also a TPL option you can activate with Tribits - IF (NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") + if(NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") #Tribits brought its own default that should take precedence - SET(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) - ENDIF() - ENDIF() + set(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) + endif() + endif() - KOKKOS_ENABLE_OPTION(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") - KOKKOS_OPTION(${PKG}_DIR "" PATH "Location of ${PKG} library") - SET(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) - SET(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) + kokkos_enable_option(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") + kokkos_option(${PKG}_DIR "" PATH "Location of ${PKG} library") + set(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) + set(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) +endfunction() - IF 
(KOKKOS_HAS_TRILINOS - AND KOKKOS_ENABLE_${PKG} - AND NOT PARSED_TRIBITS) - #this TPL was enabled, but it is not valid to use inside of TriBITS - MESSAGE(FATAL_ERROR "Enabled TPL ${PKG} inside TriBITS build, " - "but this can only be enabled in a standalone build") - ENDIF() -ENDFUNCTION() +kokkos_tpl_option(HWLOC Off TRIBITS HWLOC) +kokkos_tpl_option(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) +if(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + set(ROCM_DEFAULT ON) +else() + set(ROCM_DEFAULT OFF) +endif() +if(KOKKOS_ENABLE_HIP) + set(ROCTHRUST_DEFAULT ON) +else() + set(ROCTHRUST_DEFAULT OFF) +endif() +kokkos_tpl_option(ROCM ${ROCM_DEFAULT}) +kokkos_tpl_option(ROCTHRUST ${ROCTHRUST_DEFAULT}) +if(Kokkos_ENABLE_ROCTHRUST) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " + #include <ios> + int main() { + static_assert(_GLIBCXX_RELEASE < 9); + return 0; + } + " + Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + ) +endif() -KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(MEMKIND Off) -IF(KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE ON) -ENDIF() -KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -KOKKOS_TPL_OPTION(LIBRT Off) -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT - KOKKOS_HAS_TRILINOS) - SET(ROCM_DEFAULT ON) -ELSE() - SET(ROCM_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) -IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) - SET(ONEDPL_DEFAULT ON) -ELSE() - SET(ONEDPL_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ONEDPL ${ONEDPL_DEFAULT}) +if(KOKKOS_ENABLE_SYCL) + set(ONEDPL_DEFAULT ON) +else() + set(ONEDPL_DEFAULT OFF) +endif() +kokkos_tpl_option(ONEDPL ${ONEDPL_DEFAULT}) -IF (WIN32) - SET(LIBDL_DEFAULT Off) -ELSE() - SET(LIBDL_DEFAULT On) -ENDIF() -KOKKOS_TPL_OPTION(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) +if(WIN32) + set(LIBDL_DEFAULT Off) +else() + set(LIBDL_DEFAULT On) +endif() +kokkos_tpl_option(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) -IF(Trilinos_ENABLE_Kokkos AND 
TPL_ENABLE_HPX) -SET(HPX_DEFAULT ON) -ELSE() -SET(HPX_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(HPX ${HPX_DEFAULT}) +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) + set(HPX_DEFAULT ON) +else() + set(HPX_DEFAULT OFF) +endif() +kokkos_tpl_option(HPX ${HPX_DEFAULT}) -KOKKOS_TPL_OPTION(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) +kokkos_tpl_option(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) - SET(LIBQUADMATH_DEFAULT ON) -ELSE() - SET(LIBQUADMATH_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) + set(LIBQUADMATH_DEFAULT ON) +else() + set(LIBQUADMATH_DEFAULT OFF) +endif() +kokkos_tpl_option(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake -KOKKOS_IMPORT_TPL(HPX INTERFACE) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(CUDA INTERFACE) -ENDIF() -KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBRT) -KOKKOS_IMPORT_TPL(LIBDL) -KOKKOS_IMPORT_TPL(MEMKIND) -IF (NOT WIN32) - KOKKOS_IMPORT_TPL(THREADS INTERFACE) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(ROCM INTERFACE) - KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) -ENDIF() -KOKKOS_IMPORT_TPL(LIBQUADMATH) +kokkos_import_tpl(HPX INTERFACE) +kokkos_import_tpl(CUDA INTERFACE) +kokkos_import_tpl(HWLOC) +kokkos_import_tpl(LIBDL) +if(NOT WIN32) + kokkos_import_tpl(THREADS INTERFACE) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_import_tpl(ROCM INTERFACE) +endif() +kokkos_import_tpl(ONEDPL INTERFACE) +kokkos_import_tpl(LIBQUADMATH) +kokkos_import_tpl(ROCTHRUST) -IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) +if(Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) - KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS atomics) -ENDIF() + kokkos_export_cmake_tpl(desul REQUIRED COMPONENTS atomics) +endif() -if (Kokkos_ENABLE_IMPL_MDSPAN 
AND Kokkos_ENABLE_MDSPAN_EXTERNAL) +if(Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) find_package(mdspan REQUIRED) - KOKKOS_EXPORT_CMAKE_TPL(mdspan REQUIRED) + kokkos_export_cmake_tpl(mdspan REQUIRED) endif() -IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED) - # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency - # so we just append the flags here instead of linking with the OpenMP target. - IF(KOKKOS_HAS_TRILINOS) - COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) - ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED) - ENDIF() -ENDIF() +if(Kokkos_ENABLE_OPENMP) + find_package(OpenMP 3.0 REQUIRED COMPONENTS CXX) + kokkos_export_cmake_tpl(OpenMP REQUIRED COMPONENTS CXX) + if(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + global_append(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + endif() + if(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + global_append(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + endif() +endif() #Convert list to newlines (which CMake doesn't always like in cache variables) -STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") +string(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable -UNSET(KOKKOS_TPL_EXPORTS CACHE) -SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) -IF (KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE) - LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) -ENDIF() +unset(KOKKOS_TPL_EXPORTS CACHE) +set(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake index b30ca70ab954bb51dc0cedf95f8b408f31057fe2..2fda803b118193135617aacb1acf4a4790297e7d 100644 --- a/packages/kokkos/cmake/kokkos_tribits.cmake +++ b/packages/kokkos/cmake/kokkos_tribits.cmake @@ -1,82 +1,47 @@ #These are tribits wrappers only ever called by Kokkos itself -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) -INCLUDE(GNUInstallDirs) 
+include(CMakeParseArguments) +include(CTest) +include(GNUInstallDirs) -MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") +message(STATUS "The project name is: ${PROJECT_NAME}") -IF(GTest_FOUND) - SET(KOKKOS_GTEST_LIB GTest::gtest) - MESSAGE(STATUS "Using gtest found in ${GTest_DIR}") -ELSE() # fallback to internal gtest - SET(KOKKOS_GTEST_LIB kokkos_gtest) - MESSAGE(STATUS "Using internal gtest for testing") -ENDIF() +if(GTest_FOUND) + set(KOKKOS_GTEST_LIB GTest::gtest) + message(STATUS "Using gtest found in ${GTest_DIR}") +else() # fallback to internal gtest + set(KOKKOS_GTEST_LIB kokkos_gtest) + message(STATUS "Using internal gtest for testing") +endif() -FUNCTION(VERIFY_EMPTY CONTEXT) +function(VERIFY_EMPTY CONTEXT) if(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") + message(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") endif() -ENDFUNCTION() - -#Leave this here for now - but only do for tribits -#This breaks the standalone CMake -IF (KOKKOS_HAS_TRILINOS) - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP) - SET(${PROJECT_NAME}_ENABLE_OpenMP OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX) - SET(${PROJECT_NAME}_ENABLE_HPX OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG) - SET(${PROJECT_NAME}_ENABLE_DEBUG OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS) - SET(${PROJECT_NAME}_ENABLE_TESTS OFF) - ENDIF() - - IF(NOT DEFINED TPL_ENABLE_Pthread) - SET(TPL_ENABLE_Pthread OFF) - ENDIF() -ENDIF() - -MACRO(KOKKOS_PROCESS_SUBPACKAGES) - ADD_SUBDIRECTORY(core) - ADD_SUBDIRECTORY(containers) - ADD_SUBDIRECTORY(algorithms) - ADD_SUBDIRECTORY(simd) - if (NOT KOKKOS_HAS_TRILINOS) - ADD_SUBDIRECTORY(example) - ADD_SUBDIRECTORY(benchmarks) - endif() -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_DEF) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_DEF() - else() - #do nothing - endif() -ENDMACRO() - 
-MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) - KOKKOS_LIB_TYPE(${LIBRARY_NAME} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${LIBRARY_NAME} ${INCTYPE} $<INSTALL_INTERFACE:${KOKKOS_HEADER_DIR}>) - - INSTALL( +endfunction() + +macro(KOKKOS_PROCESS_SUBPACKAGES) + add_subdirectory(core) + add_subdirectory(containers) + add_subdirectory(algorithms) + add_subdirectory(simd) + add_subdirectory(example) + add_subdirectory(benchmarks) +endmacro() + +macro(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) + kokkos_lib_type(${LIBRARY_NAME} INCTYPE) + target_include_directories(${LIBRARY_NAME} ${INCTYPE} $<INSTALL_INTERFACE:${KOKKOS_HEADER_DIR}>) + + install( TARGETS ${LIBRARY_NAME} EXPORT ${PROJECT_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT ${PACKAGE_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT ${PACKAGE_NAME} ) - INSTALL( + install( TARGETS ${LIBRARY_NAME} EXPORT KokkosTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -84,151 +49,131 @@ MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) - VERIFY_EMPTY(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDMACRO() + verify_empty(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endmacro() + +function(KOKKOS_ADD_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "TESTONLY" "" "SOURCES;TESTONLYLIBS" ${ARGN}) -FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${ROOT_NAME} ${ARGN}) + set_source_files_properties(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + add_executable(${EXE_NAME} ${PARSE_SOURCES}) + if(PARSE_TESTONLYLIBS) + target_link_libraries(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) + endif() + verify_empty(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) + #All executables must link to all the kokkos targets + #This is just 
private linkage because exe is final + target_link_libraries(${EXE_NAME} PRIVATE Kokkos::kokkos) +endfunction() + +function(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES;CATEGORIES;ARGS" ${ARGN}) + verify_empty(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + + kokkos_add_test_executable(${ROOT_NAME} SOURCES ${PARSE_SOURCES}) + if(PARSE_ARGS) + set(TEST_NUMBER 0) + foreach(ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + string(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + list(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + math(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + kokkos_add_test( + NAME + ${TEST_NAME} + EXE + ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION + " FAILED " + ARGS + ${ARG_STR_LIST} + ) + endforeach() else() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) - - SET_SOURCE_FILES_PROPERTIES(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - #All executables must link to all the kokkos targets - #This is just private linkage because exe is final - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos) + kokkos_add_test(NAME ${ROOT_NAME} EXE ${ROOT_NAME} FAIL_REGULAR_EXPRESSION " FAILED ") + endif() + # We noticed problems with -fvisibility=hidden for inline static variables + # if Kokkos was built as shared library. 
+ if(BUILD_SHARED_LIBS AND NOT ${TEST_NAME}_DISABLE) + set_property(TARGET ${EXE_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) + set_property(TARGET ${EXE_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) + endif() + if(NOT + (Kokkos_INSTALL_TESTING + OR Kokkos_ENABLE_SYCL + OR Kokkos_ENABLE_HPX + OR Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "Intel" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.2.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.3.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC")) + ) + if(MSVC) + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "/GR-") + else() + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "-fno-rtti") + endif() + endif() +endfunction() + +function(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) + set(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + if(NOT TARGET ${TARGET_NAME}) + message(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") endif() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - - IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) - ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - IF (PARSE_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${PARSE_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST 
${ARG_STR}) - LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - KOKKOS_ADD_TEST(NAME ${TEST_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${ARG_STR_LIST} - ) - ENDFOREACH() - ELSE() - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ) - ENDIF() - ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) - SET(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - IF (NOT TARGET ${TARGET_NAME}) - MESSAGE(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") - ENDIF() - SET_PROPERTY(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT) + set_property(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) +endfunction() + +macro(KOKKOS_SETUP_BUILD_ENVIRONMENT) # This is needed for both regular build and install tests - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) #set an internal option, if not already set - SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") - IF (Kokkos_INSTALL_TESTING) - SET(KOKKOS_ENABLE_TESTS ON) - SET(KOKKOS_ENABLE_BENCHMARKS ON) - SET(KOKKOS_ENABLE_EXAMPLES ON) + set(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") + if(Kokkos_INSTALL_TESTING) + set(KOKKOS_ENABLE_TESTS ON) + set(KOKKOS_ENABLE_BENCHMARKS ON) + set(KOKKOS_ENABLE_EXAMPLES ON) # This looks a little weird, but what we are doing # is to NOT build Kokkos but instead look for an # installed Kokkos - then build examples and tests # against that installed Kokkos - FIND_PACKAGE(Kokkos REQUIRED) + find_package(Kokkos REQUIRED) # Just grab the configuration from the installation - FOREACH(DEV ${Kokkos_DEVICES}) - SET(KOKKOS_ENABLE_${DEV} ON) - ENDFOREACH() - FOREACH(OPT ${Kokkos_OPTIONS}) - SET(KOKKOS_ENABLE_${OPT} ON) - ENDFOREACH() - FOREACH(TPL 
${Kokkos_TPLS}) - SET(KOKKOS_ENABLE_${TPL} ON) - ENDFOREACH() - FOREACH(ARCH ${Kokkos_ARCH}) - SET(KOKKOS_ARCH_${ARCH} ON) - ENDFOREACH() - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) - IF (NOT KOKKOS_HAS_TRILINOS) - SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") - ENDIF() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES" - ${ARGN}) - KOKKOS_ADD_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ${PARSE_UNPARSED_ARGUMENTS} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - ) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_POSTPROCESS) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_POSTPROCESS() + foreach(DEV ${Kokkos_DEVICES}) + set(KOKKOS_ENABLE_${DEV} ON) + endforeach() + foreach(OPT ${Kokkos_OPTIONS}) + set(KOKKOS_ENABLE_${OPT} ON) + endforeach() + foreach(TPL ${Kokkos_TPLS}) + set(KOKKOS_ENABLE_${TPL} ON) + endforeach() + foreach(ARCH ${Kokkos_ARCH}) + set(KOKKOS_ARCH_${ARCH} ON) + endforeach() + else() + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") + include(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) endif() -ENDMACRO() +endmacro() + +macro(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES" ${ARGN}) + # Don't do anything if the user disabled the test 
+ if(NOT ${PACKAGE_NAME}_${ROOT_NAME}_DISABLE) + kokkos_add_executable( + ${ROOT_NAME} SOURCES ${PARSE_SOURCES} ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS ${KOKKOS_GTEST_LIB} + ) + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + endif() +endmacro() ## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based ## on enabled backends. @@ -236,307 +181,214 @@ ENDMACRO() ## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp -MACRO(KOKKOS_CONFIGURE_CORE) - SET(FWD_BACKEND_LIST) - FOREACH(MEMSPACE ${KOKKOS_MEMSPACE_LIST}) - LIST(APPEND FWD_BACKEND_LIST ${MEMSPACE}) - ENDFOREACH() - FOREACH(BACKEND_ ${KOKKOS_ENABLED_DEVICES}) - LIST(APPEND FWD_BACKEND_LIST ${BACKEND_}) - ENDFOREACH() - MESSAGE(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}") - SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") - KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") - KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") - KOKKOS_OPTION(DEFAULT_DEVICE_EXECUTION_SPACE "" STRING "Override default device execution space") - KOKKOS_OPTION(DEFAULT_HOST_PARALLEL_EXECUTION_SPACE "" STRING "Override default host parallel execution 
space") - IF (NOT Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE STREQUAL "") - SET(_DEVICE_PARALLEL ${Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE}) - MESSAGE(STATUS "Override default device execution space: ${_DEVICE_PARALLEL}") - SET(KOKKOS_DEVICE_SPACE_ACTIVE ON) - ELSE() - IF (_DEVICE_PARALLEL STREQUAL "NoTypeDefined") - SET(KOKKOS_DEVICE_SPACE_ACTIVE OFF) - ELSE() - SET(KOKKOS_DEVICE_SPACE_ACTIVE ON) - ENDIF() - ENDIF() - IF (NOT Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE STREQUAL "") - SET(_HOST_PARALLEL ${Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE}) - MESSAGE(STATUS "Override default host parallel execution space: ${_HOST_PARALLEL}") - SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON) - ELSE() - IF (_HOST_PARALLEL STREQUAL "NoTypeDefined") - SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE OFF) - ELSE() - SET(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON) - ENDIF() - ENDIF() - #We are ready to configure the header - CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) -ENDMACRO() +macro(KOKKOS_CONFIGURE_CORE) + message(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" + "${KOKKOS_ENABLED_DEVICES}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" + "${DEVICE_SETUP_LIST}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" + "${KOKKOS_ENABLED_DEVICES}" + ) + configure_file(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +endmacro() ## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination. ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. 
-MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) +macro(KOKKOS_INSTALL_ADDITIONAL_FILES) # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler - IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") - ELSE() - IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") - ENDIF() - ENDIF() - - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler - ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler - @ONLY) - - INSTALL(PROGRAMS - "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" - "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" - "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" - DESTINATION ${CMAKE_INSTALL_BINDIR}) - INSTALL(FILES - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" + if(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + set(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + else() + if(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + set(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + endif() + endif() + + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler @ONLY + ) + + install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" + "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" - DESTINATION ${KOKKOS_HEADER_DIR}) 
-ENDMACRO() - + DESTINATION ${KOKKOS_HEADER_DIR} + ) +endmacro() -FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "PLAIN_STYLE" - "" - "" - ${ARGN}) +function(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) + cmake_parse_arguments(PARSE "PLAIN_STYLE" "" "" ${ARGN}) - IF((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) + if((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) #I can use link options #check for CXX linkage using the simple 3.18 way - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$<LINK_LANGUAGE:CXX>:${KOKKOS_LINK_OPTIONS}> - ) - ELSE() + target_link_options(${LIBRARY_NAME} PUBLIC $<$<LINK_LANGUAGE:CXX>:${KOKKOS_LINK_OPTIONS}>) + else() #I can use link options #just assume CXX linkage - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() + target_link_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}) + endif() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${KOKKOS_COMPILE_OPTIONS}> + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${KOKKOS_COMPILE_OPTIONS}> ) - TARGET_COMPILE_DEFINITIONS( - ${LIBRARY_NAME} PUBLIC - $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${KOKKOS_COMPILE_DEFINITIONS}> + target_compile_definitions( + ${LIBRARY_NAME} PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${KOKKOS_COMPILE_DEFINITIONS}> ) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES} - ) + target_link_libraries(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES}) - IF (KOKKOS_ENABLE_CUDA) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${KOKKOS_CUDA_OPTIONS}> + if(KOKKOS_ENABLE_CUDA) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${KOKKOS_CUDA_OPTIONS}> ) - 
SET(NODEDUP_CUDAFE_OPTIONS) - FOREACH(OPT ${KOKKOS_CUDAFE_OPTIONS}) - LIST(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${NODEDUP_CUDAFE_OPTIONS}> + set(NODEDUP_CUDAFE_OPTIONS) + foreach(OPT ${KOKKOS_CUDAFE_OPTIONS}) + list(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${NODEDUP_CUDAFE_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_ENABLE_HIP) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${KOKKOS_AMDGPU_OPTIONS}> + if(KOKKOS_ENABLE_HIP) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${KOKKOS_AMDGPU_OPTIONS}> ) - ENDIF() - - LIST(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) - IF (XOPT_LENGTH GREATER 1) - MESSAGE(FATAL_ERROR "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12") - ENDIF() - IF(KOKKOS_XCOMPILER_OPTIONS) - SET(NODEDUP_XCOMPILER_OPTIONS) - FOREACH(OPT ${KOKKOS_XCOMPILER_OPTIONS}) + endif() + + list(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) + if(XOPT_LENGTH GREATER 1) + message( + FATAL_ERROR + "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12" + ) + endif() + if(KOKKOS_XCOMPILER_OPTIONS) + set(NODEDUP_XCOMPILER_OPTIONS) + foreach(OPT ${KOKKOS_XCOMPILER_OPTIONS}) #I have to do this for now because we can't guarantee 3.12 support #I really should do this with the shell option - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${NODEDUP_XCOMPILER_OPTIONS}> + list(APPEND 
NODEDUP_XCOMPILER_OPTIONS -Xcompiler) + list(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$<COMPILE_LANGUAGE:${KOKKOS_COMPILE_LANGUAGE}>:${NODEDUP_XCOMPILER_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_CXX_STANDARD_FEATURE) + if(KOKKOS_CXX_STANDARD_FEATURE) #GREAT! I can do this the right way - TARGET_COMPILE_FEATURES(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) - IF (NOT KOKKOS_USE_CXX_EXTENSIONS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) - ENDIF() - ELSE() + target_compile_features(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) + if(NOT KOKKOS_USE_CXX_EXTENSIONS) + set_target_properties(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) + endif() + else() #OH, well, no choice but the wrong way - TARGET_COMPILE_OPTIONS(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES" - ${ARGN}) - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - FOREACH(source ${PARSE_SOURCES}) + target_compile_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) + endif() +endfunction() + +function(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES" ${ARGN}) + + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) + endif() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + foreach(source ${PARSE_SOURCES}) set_source_files_properties(${source} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - ENDFOREACH() + endforeach() - IF(PARSE_STATIC) - SET(LINK_TYPE STATIC) - ENDIF() + if(PARSE_STATIC) + set(LINK_TYPE STATIC) + endif() - IF(PARSE_SHARED) - SET(LINK_TYPE SHARED) - ENDIF() + if(PARSE_SHARED) + set(LINK_TYPE SHARED) + endif() # MSVC and other platforms want 
to have # the headers included as source files # for better dependency detection - ADD_LIBRARY( - ${LIBRARY_NAME} - ${LINK_TYPE} - ${PARSE_HEADERS} - ${PARSE_SOURCES} - ) + add_library(${LIBRARY_NAME} ${LINK_TYPE} ${PARSE_HEADERS} ${PARSE_SOURCES}) - IF(PARSE_SHARED OR BUILD_SHARED_LIBS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES - VERSION ${Kokkos_VERSION} - SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + if(PARSE_SHARED OR BUILD_SHARED_LIBS) + set_target_properties( + ${LIBRARY_NAME} PROPERTIES VERSION ${Kokkos_VERSION} SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} ) - ENDIF() + endif() - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) + kokkos_internal_add_library_install(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name #that matches the install Kokkos:: name - ADD_LIBRARY(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "ADD_BUILD_OPTIONS" - "" - "HEADERS" - ${ARGN} - ) - IF (KOKKOS_HAS_TRILINOS) - # We do not pass headers to trilinos. They would get installed - # to the default include folder, but we want headers installed - # preserving the directory structure, e.g. 
impl - # If headers got installed in both locations, it breaks some - # downstream packages - TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} - ADDED_LIB_TARGET_NAME_OUT ${LIBRARY_NAME}_TARGET_NAME ) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${${LIBRARY_NAME}_TARGET_NAME}) - ENDIF() - ELSE() - # Forward the headers, we want to know about all headers - # to make sure they appear correctly in IDEs - KOKKOS_INTERNAL_ADD_LIBRARY( - ${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME}) - ENDIF() - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) - ELSE() - ADD_LIBRARY(${NAME} INTERFACE) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) - IF(KOKKOS_HAS_TRILINOS) - #ignore the target, tribits doesn't do anything directly with targets - TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) - ELSE() #append to a list for later - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - FOREACH(DIR ${ARGN}) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} $<BUILD_INTERFACE:${DIR}>) - ENDFOREACH() - ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_LIB_COMPILE_OPTIONS TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_ADD_TEST_DIRECTORIES) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_TEST_DIRECTORIES(${ARGN}) - ELSE() - IF(KOKKOS_ENABLE_TESTS) - FOREACH(TEST_DIR ${ARGN}) - ADD_SUBDIRECTORY(${TEST_DIR}) - ENDFOREACH() - ENDIF() - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_EXAMPLE_DIRECTORIES) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXAMPLE_DIRECTORIES(${ARGN}) - else() - IF(KOKKOS_ENABLE_EXAMPLES) - FOREACH(EXAMPLE_DIR ${ARGN}) - ADD_SUBDIRECTORY(${EXAMPLE_DIR}) - ENDFOREACH() - ENDIF() + 
add_library(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) +endfunction() + +function(KOKKOS_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "ADD_BUILD_OPTIONS" "" "HEADERS" ${ARGN}) + # Forward the headers, we want to know about all headers + # to make sure they appear correctly in IDEs + kokkos_internal_add_library(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) + if(PARSE_ADD_BUILD_OPTIONS) + kokkos_set_library_properties(${LIBRARY_NAME}) + endif() +endfunction() + +function(KOKKOS_ADD_INTERFACE_LIBRARY NAME) + add_library(${NAME} INTERFACE) + kokkos_internal_add_library_install(${NAME}) +endfunction() + +function(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + foreach(DIR ${ARGN}) + target_include_directories(${TARGET} ${INCTYPE} $<BUILD_INTERFACE:${DIR}>) + endforeach() +endfunction() + +function(KOKKOS_LIB_COMPILE_OPTIONS TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + target_compile_options(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) +endfunction() + +macro(KOKKOS_ADD_TEST_DIRECTORIES) + if(KOKKOS_ENABLE_TESTS) + foreach(TEST_DIR ${ARGN}) + add_subdirectory(${TEST_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_EXAMPLE_DIRECTORIES) + if(KOKKOS_ENABLE_EXAMPLES) + foreach(EXAMPLE_DIR ${ARGN}) + add_subdirectory(${EXAMPLE_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_BENCHMARK_DIRECTORIES) + if(KOKKOS_ENABLE_BENCHMARKS) + foreach(BENCHMARK_DIR ${ARGN}) + add_subdirectory(${BENCHMARK_DIR}) + endforeach() endif() -ENDMACRO() - -MACRO(KOKKOS_ADD_BENCHMARK_DIRECTORIES) - IF(KOKKOS_ENABLE_BENCHMARKS) - FOREACH(BENCHMARK_DIR ${ARGN}) - ADD_SUBDIRECTORY(${BENCHMARK_DIR}) - ENDFOREACH() - ENDIF() -ENDMACRO() +endmacro() diff --git a/packages/kokkos/cmake/msvc.cmake b/packages/kokkos/cmake/msvc.cmake index 85421bdbaaa46dd5d671f4e86b50d52b25d98d30..1de13585c730bd81bb646f5fcec9099cbc4496c1 100644 --- a/packages/kokkos/cmake/msvc.cmake +++ 
b/packages/kokkos/cmake/msvc.cmake @@ -1,11 +1,9 @@ - -FUNCTION(kokkos_set_msvc_flags full_standard int_standard) - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - +function(kokkos_set_msvc_flags full_standard int_standard) + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + endif() +endfunction() diff --git a/packages/kokkos/cmake/pgi.cmake b/packages/kokkos/cmake/pgi.cmake index e98e84955888496225e9268c7db47ef514f08a48..45f59dcd10bf65cdf2d173bc9124c2acc75fd2d2 100644 --- a/packages/kokkos/cmake/pgi.cmake +++ b/packages/kokkos/cmake/pgi.cmake @@ -1,8 +1,6 @@ - function(kokkos_set_pgi_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake b/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake deleted file mode 100644 index 4709f8002b11923f1f20801e077ea98172b58c7c..0000000000000000000000000000000000000000 --- a/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake +++ /dev/null @@ -1,26 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# 
Kokkos v. 4.0 -# Copyright (2022) National Technology & Engineering -# Solutions of Sandia, LLC (NTESS). -# -# Under the terms of Contract DE-NA0003525 with NTESS, -# the U.S. Government retains certain rights in this software. -# -# Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#@HEADER - -# Check for CUDA support - -IF (NOT TPL_ENABLE_CUDA) - MESSAGE(FATAL_ERROR "\nCUSPARSE requires CUDA") -ELSE() - GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) - GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) -ENDIF() - diff --git a/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake index 4e05d22534893961c6ba16e8a1664d312de5fc31..52d8368d0419c1ab335f2bc729f4b17fa7da2ecd 100644 --- a/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. 
# @@ -26,8 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) - +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/packages/kokkos/cmake/tpls/FindTPLPthread.cmake b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake index 3d5b03805d4d9bd299af7d6ffda62440b223cdce..f51bce5d64d72c688b6881d186a975eaaf85abc2 100644 --- a/packages/kokkos/cmake/tpls/FindTPLPthread.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -15,29 +15,26 @@ # ************************************************************************ # @HEADER -SET(USE_THREADS FALSE) +set(USE_THREADS FALSE) -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. 
- IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake index 8560ec60f1b52ae2bab9c0c9c9c754c85da373af..b449f45135aa5bbe2f6d06f63031009658e2a44c 100644 --- a/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake +++ b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +tribits_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/packages/kokkos/config/test_all_sandia b/packages/kokkos/config/test_all_sandia deleted file mode 100755 index 193a162a4e6e385db674d7b3410fe39f81d4e648..0000000000000000000000000000000000000000 --- a/packages/kokkos/config/test_all_sandia +++ /dev/null @@ -1,773 +0,0 @@ -#!/bin/bash -e - -# -# Global config -# - -set -o pipefail - -# Determine current machine. 
- -MACHINE="" -HOSTNAME=$(hostname) -PROCESSOR=`uname -p` - -if [[ "$HOSTNAME" =~ (white|ride).* ]]; then - MACHINE=white - module load git -fi - -if [[ "$HOSTNAME" =~ .*bowman.* ]]; then - MACHINE=bowman - module load git -fi - -if [[ "$HOSTNAME" == n* ]]; then # Warning: very generic name - if [[ "$PROCESSOR" = "aarch64" ]]; then - MACHINE=sullivan - module load git - fi -fi - -if [[ "$HOSTNAME" == node* ]]; then # Warning: very generic name - if [[ "$MACHINE" = "" ]]; then - MACHINE=shepard - module load git - fi -fi - -if [[ "$HOSTNAME" == apollo\.* ]]; then - MACHINE=apollo - module load git -fi - -if [[ "$HOSTNAME" == sullivan ]]; then - MACHINE=sullivan - module load git -fi - -if [[ "$HOSTNAME" == mayer\.* ]]; then - MACHINE=mayer -# module load git -fi -if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name - MACHINE=mayer -fi - -if [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then - if [[ "$MACHINE" = "" ]]; then - MACHINE=sems - module load sems-git - fi -fi - -if [[ "$MACHINE" = "" ]]; then - echo "Unrecognized machine" >&2 - exit 1 -fi - -echo "Running on machine: $MACHINE" - -GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" -CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" -CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" - -GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" -IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" 
-INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -#CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" -PGI_WARNING_FLAGS="" - -# Default. Machine specific can override. -DEBUG=False -ARGS="" -CUSTOM_BUILD_LIST="" -DRYRUN=False -BUILD_ONLY=False -declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 -TEST_SCRIPT=False -SKIP_HWLOC=False -SPOT_CHECK=False - -PRINT_HELP=False -OPT_FLAG="" -CXX_FLAGS_EXTRA="" -LD_FLAGS_EXTRA="" -KOKKOS_OPTIONS="" - -# -# Handle arguments. -# - -while [[ $# > 0 ]] -do - key="$1" - - case $key in - --kokkos-path*) - KOKKOS_PATH="${key#*=}" - ;; - --build-list*) - CUSTOM_BUILD_LIST="${key#*=}" - ;; - --debug*) - DEBUG=True - ;; - --build-only*) - BUILD_ONLY=True - ;; - --test-script*) - TEST_SCRIPT=True - ;; - --skip-hwloc*) - SKIP_HWLOC=True - ;; - --num*) - NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" - ;; - --dry-run*) - DRYRUN=True - ;; - --spot-check*) - SPOT_CHECK=True - ;; - --arch*) - ARCH_FLAG="--arch=${key#*=}" - ;; - --opt-flag*) - OPT_FLAG="${key#*=}" - ;; - --with-cuda-options*) - KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" - ;; - --with-options*) - KOKKOS_OPTIONS="--with-options=enable_large_mem_tests,${key#*=}" - ;; - --cxxflags-extra*) - CXX_FLAGS_EXTRA="${key#*=}" - ;; - --ldflags-extra*) - LD_FLAGS_EXTRA="${key#*=}" - ;; - --help*) - PRINT_HELP=True - ;; - *) - # args, just append - ARGS="$ARGS $1" - ;; - esac - - shift -done - -SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) - -# Set kokkos path. -if [ -z "$KOKKOS_PATH" ]; then - KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT -else - # Ensure KOKKOS_PATH is abs path. - KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) -fi - -UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` -if ! [ -z "$UNCOMMITTED" ]; then - echo "WARNING!! 
THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" - echo "$UNCOMMITTED" - echo "" -fi - -GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` -echo "Repository Status: " ${GITSTATUS} -echo "" -echo "" - -# -# Machine specific config. -# - -if [ "$MACHINE" = "sems" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - - BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-<COMPILER_NAME>/<COMPILER_VERSION>" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="" - fi - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ 
$CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - fi -elif [ "$MACHINE" = "white" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>" - CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.4.0,ibm/xl/16.1.0" - - # Don't do pthread on white. - GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - "cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=Power8,Kepler37" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "bowman" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - 
if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "sullivan" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.1.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-ThunderX" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "mayer" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - ARM_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/1.4.0 $ARM_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-TX2" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "shepard" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - BASE_MODULE_LIST_INTEL="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=HSW" - fi - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "apollo" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - module use /home/projects/modulefiles/local/x86-64 - module load kokkos-env - - module load sems-git - module load 
sems-tex - module load sems-cmake/3.5.2 - module load sems-gdb - - SKIP_HWLOC=True - - BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0" - - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" - BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" - BUILD_LIST_CLANG="Serial,Pthread,OpenMP" - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" - "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 
$BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - ) - fi - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=SNB,Volta70" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -else - echo "Unhandled machine $MACHINE" >&2 - exit 1 -fi - -export OMP_NUM_THREADS=4 - -declare -i NUM_RESULTS_TO_KEEP=7 - -RESULT_ROOT_PREFIX=TestAll - -if [ "$PRINT_HELP" = "True" ]; then - echo "test_all_sandia <ARGS> <OPTIONS>:" - echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" - echo " Defaults to root repo containing this script" - echo "--debug: Run tests in debug. Defaults to False" - echo "--test-script: Test this script, not Kokkos" - echo "--skip-hwloc: Do not do hwloc tests" - echo "--num=N: Number of jobs to run in parallel" - echo "--spot-check: Minimal test set to issue pull request" - echo "--dry-run: Just print what would be executed" - echo "--build-only: Just do builds, don't run anything" - echo "--opt-flag=FLAG: Optimization flag (default: -O3)" - echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" - echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" - echo "--arch=ARCHITECTURE: overwrite architecture flags" - echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" - echo "--build-list=BUILD,BUILD,BUILD..." 
- echo " Provide a comma-separated list of builds instead of running all builds" - echo " Valid items:" - echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" - echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" - echo "" - - echo "ARGS: list of expressions matching compilers to test" - echo " supported compilers sems" - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - echo " $COMPILER" - done - echo "" - - echo "Examples:" - echo " Run all tests" - echo " % test_all_sandia" - echo "" - echo " Run all gcc tests" - echo " % test_all_sandia gcc" - echo "" - echo " Run all gcc/4.8.4 and all intel tests" - echo " % test_all_sandia gcc/4.8.4 intel" - echo "" - echo " Run all tests in debug" - echo " % test_all_sandia --debug" - echo "" - echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" - echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" - echo "" - echo "If you want to kill the tests, do:" - echo " hit ctrl-z" - echo " % kill -9 %1" - echo - exit 0 -fi - -# Set build type. -if [ "$DEBUG" = "True" ]; then - BUILD_TYPE=debug -else - BUILD_TYPE=release -fi - -# If no args provided, do all compilers. -if [ -z "$ARGS" ]; then - ARGS='?' -fi - -# Process args to figure out which compilers to test. -COMPILERS_TO_TEST="" - -for ARG in $ARGS; do - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - - if [[ "$COMPILER" = $ARG* ]]; then - if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then - COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" - else - echo "Tried to add $COMPILER twice" - fi - fi - done -done - -# -# Functions. -# - -# get_compiler_name <COMPILER> -get_compiler_name() { - echo $1 | cut -d/ -f1 -} - -# get_compiler_version <COMPILER> -get_compiler_version() { - echo $1 | cut -d/ -f2 -} - -# Do not call directly. 
-get_compiler_data() { - local compiler=$1 - local item=$2 - local compiler_name=$(get_compiler_name $compiler) - local compiler_vers=$(get_compiler_version $compiler) - - local compiler_data - for compiler_data in "${COMPILERS[@]}" ; do - local arr=($compiler_data) - - if [ "$compiler" = "${arr[0]}" ]; then - echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g" - return 0 - fi - done - - # Not found. - echo "Unreconized compiler $compiler" >&2 - exit 1 -} - -# -# For all getters, usage: <GETTER> <COMPILER> -# - -get_compiler_modules() { - get_compiler_data $1 1 -} - -get_compiler_build_list() { - get_compiler_data $1 2 -} - -get_compiler_exe_name() { - get_compiler_data $1 3 -} - -get_compiler_warning_flags() { - get_compiler_data $1 4 -} - -run_cmd() { - echo "RUNNING: $*" - if [ "$DRYRUN" != "True" ]; then - eval "$* 2>&1" - fi -} - -# report_and_log_test_results <SUCCESS> <DESC> <COMMENT> -report_and_log_test_result() { - # Use sane var names. - local success=$1; local desc=$2; local comment=$3; - - if [ "$success" = "0" ]; then - echo " PASSED $desc" - echo $comment > $PASSED_DIR/$desc - else - # For failures, comment should be the name of the phase that failed. - echo " FAILED $desc" >&2 - echo $comment > $FAILED_DIR/$desc - cat ${desc}.${comment}.log - fi -} - -setup_env() { - local compiler=$1 - local compiler_modules=$(get_compiler_modules $compiler) - - module purge - - local mod - for mod in $compiler_modules; do - echo "Loading module $mod" - module load $mod 2>&1 - # It is ridiculously hard to check for the success of a loaded - # module. Module does not return error codes and piping to grep - # causes module to run in a subshell. - module list 2>&1 | grep "$mod" >& /dev/null || return 1 - done - - return 0 -} - -# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE> -single_build_and_test() { - # Use sane var names. 
- local compiler=$1; local build=$2; local build_type=$3; - - # Set up env. - mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" - cd $ROOT_DIR/$compiler/"${build}-$build_type" - local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') - setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - - # Set up flags. - local compiler_warning_flags=$(get_compiler_warning_flags $compiler) - local compiler_exe=$(get_compiler_exe_name $compiler) - - if [[ "$build_type" = hwloc* ]]; then - local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) - fi - - if [[ "$OPT_FLAG" = "" ]]; then - OPT_FLAG="-O3" - fi - - if [[ "$build_type" = *debug* ]]; then - local extra_args="$extra_args --debug" - local cxxflags="-g $compiler_warning_flags" - local ldflags="-g" - else - local cxxflags="$OPT_FLAG $compiler_warning_flags" - local ldflags="${OPT_FLAG}" - fi - - local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" - local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" - - if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" - fi - if [[ "$KOKKOS_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_OPTIONS" - else - local extra_args="$extra_args --with-options=enable_large_mem_tests" - fi - - echo " Starting job $desc" - - local comment="no_comment" - - if [ "$TEST_SCRIPT" = "True" ]; then - local rand=$[ 1 + $[ RANDOM % 10 ]] - sleep $rand - - if [ $rand -gt 5 ]; then - run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } - fi - else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - local -i build_start_time=$(date +%s) - run_cmd make -j 48 build-test >& ${desc}.build.log || { 
report_and_log_test_result 1 ${desc} build && return 0; } - local -i build_end_time=$(date +%s) - comment="build_time=$(($build_end_time-$build_start_time))" - - if [[ "$BUILD_ONLY" == False ]]; then - run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } - local -i run_end_time=$(date +%s) - comment="$comment run_time=$(($run_end_time-$build_end_time))" - fi - fi - - report_and_log_test_result 0 $desc "$comment" - - return 0 -} - -# wait_for_jobs <NUM-JOBS> -wait_for_jobs() { - local -i max_jobs=$1 - local -i num_active_jobs=$(jobs | wc -l) - while [ $num_active_jobs -ge $max_jobs ] - do - sleep 1 - num_active_jobs=$(jobs | wc -l) - jobs >& /dev/null - done -} - -# run_in_background <COMPILER> <BUILD> <BUILD_TYPE> -run_in_background() { - local compiler=$1 - - local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL - # Don't override command line input. - # if [[ "$BUILD_ONLY" == True ]]; then - # num_jobs=8 - # else - if [[ "$compiler" == cuda* ]]; then - num_jobs=1 - fi - if [[ "$compiler" == clang ]]; then - num_jobs=1 - fi - # fi - wait_for_jobs $num_jobs - - single_build_and_test $* & -} - -# build_and_test_all <COMPILER> -build_and_test_all() { - # Get compiler data. - local compiler=$1 - if [ -z "$CUSTOM_BUILD_LIST" ]; then - local compiler_build_list=$(get_compiler_build_list $compiler) - else - local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ') - fi - - # Do builds. - local build - for build in $compiler_build_list - do - run_in_background $compiler $build $BUILD_TYPE - - # If not cuda, do a hwloc test too. - if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then - run_in_background $compiler $build "hwloc-$BUILD_TYPE" - fi - done - - return 0 -} - -get_test_root_dir() { - local existing_results=$(find . 
-maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort) - local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l) - local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP} - - if [ $num_to_delete -gt 0 ]; then - /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete) - fi - - echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S") -} - -wait_summarize_and_exit() { - wait_for_jobs 1 - - echo "#######################################################" - echo "PASSED TESTS" - echo "#######################################################" - - local passed_test - for passed_test in $(\ls -1 $PASSED_DIR | sort) - do - echo $passed_test $(cat $PASSED_DIR/$passed_test) - done - - local -i rv=0 - if [ "$(ls -A $FAILED_DIR)" ]; then - echo "#######################################################" - echo "FAILED TESTS" - echo "#######################################################" - - local failed_test - for failed_test in $(\ls -1 $FAILED_DIR | sort) - do - echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" - rv=$rv+1 - done - fi - - exit $rv -} - -# -# Main. 
-# - -ROOT_DIR=$(get_test_root_dir) -mkdir -p $ROOT_DIR -cd $ROOT_DIR - -PASSED_DIR=$ROOT_DIR/results/passed -FAILED_DIR=$ROOT_DIR/results/failed -mkdir -p $PASSED_DIR -mkdir -p $FAILED_DIR - -echo "Going to test compilers: " $COMPILERS_TO_TEST -for COMPILER in $COMPILERS_TO_TEST; do - echo "Testing compiler $COMPILER" - build_and_test_all $COMPILER -done - -wait_summarize_and_exit diff --git a/packages/kokkos/config/yaml/volta.yaml b/packages/kokkos/config/yaml/volta.yaml deleted file mode 100644 index f67af9c2a44a427f6a8021763bced669cf8b30f6..0000000000000000000000000000000000000000 --- a/packages/kokkos/config/yaml/volta.yaml +++ /dev/null @@ -1,4 +0,0 @@ -packages: - kokkos: - variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1 - compiler: [gcc@7.2.0] diff --git a/packages/kokkos/containers/CMakeLists.txt b/packages/kokkos/containers/CMakeLists.txt index 0857d7007b44b7f5280a8ca5b44f4eec09191951..8ee8bb41a28ab25217e67dc4d1fd0379103f9a66 100644 --- a/packages/kokkos/containers/CMakeLists.txt +++ b/packages/kokkos/containers/CMakeLists.txt @@ -1,9 +1,9 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT KOKKOS_ENABLE_OPENACC) -KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -KOKKOS_ADD_TEST_DIRECTORIES(performance_tests) -ENDIF() +if(NOT KOKKOS_ENABLE_OPENACC) + kokkos_add_test_directories(unit_tests) + kokkos_add_test_directories(performance_tests) +endif() diff --git a/packages/kokkos/containers/performance_tests/CMakeLists.txt b/packages/kokkos/containers/performance_tests/CMakeLists.txt index e325e45e85dc0c8fd95fa19cc93e6733e7c37339..8d4d605b087118ece10f61d752273089eac57b4d 100644 --- a/packages/kokkos/containers/performance_tests/CMakeLists.txt +++ b/packages/kokkos/containers/performance_tests/CMakeLists.txt @@ -1,7 +1,6 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) 
-KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) string(TOUPPER ${Tag} DEVICE) @@ -10,14 +9,8 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) if(Kokkos_ENABLE_${DEVICE}) message(STATUS "Sources Test${Tag}.cpp") - set(SOURCES - TestMain.cpp - Test${Tag}.cpp - ) + set(SOURCES TestMain.cpp Test${Tag}.cpp) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - ContainersPerformanceTest_${Tag} - SOURCES ${SOURCES} - ) + kokkos_add_executable_and_test(ContainersPerformanceTest_${Tag} SOURCES ${SOURCES}) endif() endforeach() diff --git a/packages/kokkos/containers/performance_tests/TestScatterView.hpp b/packages/kokkos/containers/performance_tests/TestScatterView.hpp index a74f833b9f524a524a2a1bd09b49d1a71940cd7f..953b8bff6e59a3ad7f48cfd0f6283579d1fc0424 100644 --- a/packages/kokkos/containers/performance_tests/TestScatterView.hpp +++ b/packages/kokkos/containers/performance_tests/TestScatterView.hpp @@ -25,8 +25,8 @@ namespace Perf { template <typename ExecSpace, typename Layout, typename Duplication, typename Contribution> void test_scatter_view(int m, int n) { - Kokkos::View<double * [3], Layout, ExecSpace> original_view("original_view", - n); + Kokkos::View<double* [3], Layout, ExecSpace> original_view("original_view", + n); { auto scatter_view = Kokkos::Experimental::create_scatter_view< Kokkos::Experimental::ScatterSum, Duplication, Contribution>( @@ -40,8 +40,8 @@ void test_scatter_view(int m, int n) { { auto num_threads = unique_token.size(); std::cout << "num_threads " << num_threads << '\n'; - Kokkos::View<double* * [3], Layout, ExecSpace> - hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n); + 
Kokkos::View<double** [3], Layout, ExecSpace> hand_coded_duplicate_view( + "hand_coded_duplicate", num_threads, n); auto f2 = KOKKOS_LAMBDA(int i) { auto thread_id = unique_token.acquire(); for (int j = 0; j < 10; ++j) { diff --git a/packages/kokkos/containers/src/CMakeLists.txt b/packages/kokkos/containers/src/CMakeLists.txt index b7d85ebf11d77b30750d81a3084c9e5f41f0617b..b386fbe67505ba5bb3124c27a08c1cec7dad6f86 100644 --- a/packages/kokkos/containers/src/CMakeLists.txt +++ b/packages/kokkos/containers/src/CMakeLists.txt @@ -1,33 +1,27 @@ #need these here for now -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -SET(KOKKOS_CONTAINERS_SRCS) -APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CONTAINER_HEADERS) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +set(KOKKOS_CONTAINERS_SRCS) +append_glob(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CONTAINER_HEADERS) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) - -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) -KOKKOS_ADD_LIBRARY( - kokkoscontainers - SOURCES ${KOKKOS_CONTAINERS_SRCS} - HEADERS ${KOKKOS_CONTAINERS_HEADERS} -) +kokkos_add_library(kokkoscontainers SOURCES ${KOKKOS_CONTAINERS_SRCS} HEADERS ${KOKKOS_CONTAINERS_HEADERS}) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} 
+kokkos_lib_include_directories( + kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) +kokkos_link_internal_library(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp index cd5ca4ea512365584c38142adb25afdc20556bd9..409260f0218dbe1bb2ba38a98d5370e5c1551416 100644 --- a/packages/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp @@ -28,24 +28,6 @@ namespace Kokkos { -namespace Impl { -//! Either append to the label if the property already exists, or set it. -template <typename... P> -auto with_updated_label(const ViewCtorProp<P...>& view_ctor_prop, - const std::string& label) { - using vcp_t = ViewCtorProp<P...>; - //! If the label property is already set, append. Otherwise, set label. - if constexpr (vcp_t::has_label) { - vcp_t new_ctor_props(view_ctor_prop); - static_cast<ViewCtorProp<void, std::string>&>(new_ctor_props) - .value.append(label); - return new_ctor_props; - } else { - return Impl::with_properties_if_unset(view_ctor_prop, label); - } -} -} // namespace Impl - template <typename Device = Kokkos::DefaultExecutionSpace> class Bitset; @@ -92,9 +74,10 @@ class Bitset { using block_view_type = View<unsigned*, Device, MemoryTraits<RandomAccess>>; public: - /// constructor + Bitset() = default; + /// arg_size := number of bit in set - Bitset(unsigned arg_size = 0u) : Bitset(Kokkos::view_alloc(), arg_size) {} + Bitset(unsigned arg_size) : Bitset(Kokkos::view_alloc(), arg_size) {} template <class... P> Bitset(const Impl::ViewCtorProp<P...>& arg_prop, unsigned arg_size) @@ -108,9 +91,8 @@ class Bitset { "Allocation properties should not contain the 'pointer' property."); //! Update 'label' property and allocate. 
- const auto prop_copy = Kokkos::Impl::with_updated_label( - Impl::with_properties_if_unset(arg_prop, std::string("Bitset")), - " - blocks"); + const auto prop_copy = + Impl::with_properties_if_unset(arg_prop, std::string("Bitset")); m_blocks = block_view_type(prop_copy, ((m_size + block_mask) >> block_shift)); @@ -289,7 +271,7 @@ class Bitset { offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; - block = Impl::rotate_right(block, offset); + block = Impl::rotate_right(block, offset); return (((!(scan_direction & BIT_SCAN_REVERSE) ? Impl::bit_scan_forward(block) : Impl::int_log2(block)) + @@ -310,8 +292,8 @@ class Bitset { } private: - unsigned m_size; - unsigned m_last_block_mask; + unsigned m_size = 0; + unsigned m_last_block_mask = 0; block_view_type m_blocks; private: diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp index 84bced2cc4472c10fcbebb0b11ce6e7ad72d24d1..6a2e6f73a15e92ad4f0071bc218e8967bda05409 100644 --- a/packages/kokkos/containers/src/Kokkos_DualView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp @@ -275,14 +275,29 @@ class DualView : public ViewTraits<DataType, Properties...> { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : modified_flags(t_modified_flags("DualView::modified_flags")), - d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) { - // without UVM, host View mirrors - if constexpr (Kokkos::Impl::has_type<Impl::WithoutInitializing_t, - P...>::value) - h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); - else - h_view = Kokkos::create_mirror_view(d_view); + : modified_flags(t_modified_flags("DualView::modified_flags")) { + if constexpr (Impl::ViewCtorProp<P...>::sequential_host_init) { + h_view = t_host(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + static_assert(Impl::ViewCtorProp<P...>::initialize, + 
"DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!Impl::ViewCtorProp<P...>::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); + + d_view = Kokkos::create_mirror_view_and_copy( + typename traits::memory_space{}, h_view); + } else { + d_view = t_dev(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + + // without UVM, host View mirrors + if constexpr (Kokkos::Impl::has_type<Impl::WithoutInitializing_t, + P...>::value) + h_view = + Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); + else + h_view = Kokkos::create_mirror_view(d_view); + } } //! Copy constructor (shallow copy) @@ -292,15 +307,6 @@ class DualView : public ViewTraits<DataType, Properties...> { d_view(src.d_view), h_view(src.h_view) {} - //! Copy assignment operator (shallow copy assignment) - template <typename DT, typename... DP> - DualView& operator=(const DualView<DT, DP...>& src) { - modified_flags = src.modified_flags; - d_view = src.d_view; - h_view = src.h_view; - return *this; - } - //! Subview constructor template <class DT, class... DP, class Arg0, class... Args> DualView(const DualView<DT, DP...>& src, const Arg0& arg0, Args... args) @@ -347,23 +353,21 @@ class DualView : public ViewTraits<DataType, Properties...> { // does the DualView have only one device struct impl_dualview_is_single_device { enum : bool { - value = std::is_same<typename t_dev::device_type, - typename t_host::device_type>::value + value = std::is_same_v<typename t_dev::device_type, + typename t_host::device_type> }; }; // does the given device match the device of t_dev? template <typename Device> struct impl_device_matches_tdev_device { - enum : bool { - value = std::is_same<typename t_dev::device_type, Device>::value - }; + enum : bool { value = std::is_same_v<typename t_dev::device_type, Device> }; }; // does the given device match the device of t_host? 
template <typename Device> struct impl_device_matches_thost_device { enum : bool { - value = std::is_same<typename t_host::device_type, Device>::value + value = std::is_same_v<typename t_host::device_type, Device> }; }; @@ -371,7 +375,7 @@ class DualView : public ViewTraits<DataType, Properties...> { template <typename Device> struct impl_device_matches_thost_exec { enum : bool { - value = std::is_same<typename t_host::execution_space, Device>::value + value = std::is_same_v<typename t_host::execution_space, Device> }; }; @@ -379,7 +383,7 @@ class DualView : public ViewTraits<DataType, Properties...> { template <typename Device> struct impl_device_matches_tdev_exec { enum : bool { - value = std::is_same<typename t_dev::execution_space, Device>::value + value = std::is_same_v<typename t_dev::execution_space, Device> }; }; @@ -387,8 +391,8 @@ class DualView : public ViewTraits<DataType, Properties...> { template <typename Device> struct impl_device_matches_tdev_memory_space { enum : bool { - value = std::is_same<typename t_dev::memory_space, - typename Device::memory_space>::value + value = std::is_same_v<typename t_dev::memory_space, + typename Device::memory_space> }; }; @@ -398,11 +402,6 @@ class DualView : public ViewTraits<DataType, Properties...> { /// \brief Return a View on a specific device \c Device. /// - /// Please don't be afraid of the nested if_c expressions in the return - /// value's type. That just tells the method what the return type - /// should be: t_dev if the \c Device template parameter matches - /// this DualView's device type, else t_host. 
- /// /// For example, suppose you create a DualView on Cuda, like this: /// \code /// using dual_view_type = @@ -419,56 +418,47 @@ class DualView : public ViewTraits<DataType, Properties...> { /// typename dual_view_type::t_host hostView = DV.view<host_device_type> (); /// \endcode template <class Device> - KOKKOS_INLINE_FUNCTION const typename std::conditional_t< - impl_device_matches_tdev_device<Device>::value, t_dev, - typename std::conditional_t< - impl_device_matches_thost_device<Device>::value, t_host, - typename std::conditional_t< - impl_device_matches_thost_exec<Device>::value, t_host, - typename std::conditional_t< - impl_device_matches_tdev_exec<Device>::value, t_dev, - typename std::conditional_t< - impl_device_matches_tdev_memory_space<Device>::value, - t_dev, t_host>>>>> - view() const { - constexpr bool device_is_memspace = - std::is_same<Device, typename Device::memory_space>::value; - constexpr bool device_is_execspace = - std::is_same<Device, typename Device::execution_space>::value; - constexpr bool device_exec_is_t_dev_exec = - std::is_same<typename Device::execution_space, - typename t_dev::execution_space>::value; - constexpr bool device_mem_is_t_dev_mem = - std::is_same<typename Device::memory_space, - typename t_dev::memory_space>::value; - constexpr bool device_exec_is_t_host_exec = - std::is_same<typename Device::execution_space, - typename t_host::execution_space>::value; - constexpr bool device_mem_is_t_host_mem = - std::is_same<typename Device::memory_space, - typename t_host::memory_space>::value; - constexpr bool device_is_t_host_device = - std::is_same<typename Device::execution_space, - typename t_host::device_type>::value; - constexpr bool device_is_t_dev_device = - std::is_same<typename Device::memory_space, - typename t_host::device_type>::value; - - static_assert( - device_is_t_dev_device || device_is_t_host_device || - (device_is_memspace && - (device_mem_is_t_dev_mem || device_mem_is_t_host_mem)) || - (device_is_execspace && 
- (device_exec_is_t_dev_exec || device_exec_is_t_host_exec)) || - ((!device_is_execspace && !device_is_memspace) && - ((device_mem_is_t_dev_mem || device_mem_is_t_host_mem) || - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))), - "Template parameter to .view() must exactly match one of the " - "DualView's device types or one of the execution or memory spaces"); - - return Impl::if_c<std::is_same<typename t_dev::memory_space, - typename Device::memory_space>::value, - t_dev, t_host>::select(d_view, h_view); + KOKKOS_FUNCTION auto view() const { + if constexpr (std::is_same_v<Device, typename Device::memory_space>) { + if constexpr (std::is_same_v<typename Device::memory_space, + typename t_dev::memory_space>) { + return d_view; + } else { + static_assert(std::is_same_v<typename Device::memory_space, + typename t_host::memory_space>, + "The template argument is a memory space but doesn't " + "match either of DualView's memory spaces!"); + return h_view; + } + } else { + if constexpr (std::is_same_v<Device, typename Device::execution_space>) { + if constexpr (std::is_same_v<typename Device::execution_space, + typename t_dev::execution_space>) { + return d_view; + } else { + static_assert(std::is_same_v<typename Device::execution_space, + typename t_host::execution_space>, + "The template argument is an execution space but " + "doesn't match either of DualView's execution spaces!"); + return h_view; + } + } else { + static_assert(std::is_same_v<Device, typename Device::device_type>, + "The template argument is neither a memory space, " + "execution space, or device!"); + if constexpr (std::is_same_v<Device, typename t_dev::device_type>) + return d_view; + else { + static_assert(std::is_same_v<Device, typename t_host::device_type>, + "The template argument is a device but " + "doesn't match either of DualView's devices!"); + return h_view; + } + } + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } KOKKOS_INLINE_FUNCTION @@ -484,27 +474,27 
@@ class DualView : public ViewTraits<DataType, Properties...> { template <class Device> static int get_device_side() { constexpr bool device_is_memspace = - std::is_same<Device, typename Device::memory_space>::value; + std::is_same_v<Device, typename Device::memory_space>; constexpr bool device_is_execspace = - std::is_same<Device, typename Device::execution_space>::value; + std::is_same_v<Device, typename Device::execution_space>; constexpr bool device_exec_is_t_dev_exec = - std::is_same<typename Device::execution_space, - typename t_dev::execution_space>::value; + std::is_same_v<typename Device::execution_space, + typename t_dev::execution_space>; constexpr bool device_mem_is_t_dev_mem = - std::is_same<typename Device::memory_space, - typename t_dev::memory_space>::value; + std::is_same_v<typename Device::memory_space, + typename t_dev::memory_space>; constexpr bool device_exec_is_t_host_exec = - std::is_same<typename Device::execution_space, - typename t_host::execution_space>::value; + std::is_same_v<typename Device::execution_space, + typename t_host::execution_space>; constexpr bool device_mem_is_t_host_mem = - std::is_same<typename Device::memory_space, - typename t_host::memory_space>::value; + std::is_same_v<typename Device::memory_space, + typename t_host::memory_space>; constexpr bool device_is_t_host_device = - std::is_same<typename Device::execution_space, - typename t_host::device_type>::value; + std::is_same_v<typename Device::execution_space, + typename t_host::device_type>; constexpr bool device_is_t_dev_device = - std::is_same<typename Device::memory_space, - typename t_host::device_type>::value; + std::is_same_v<typename Device::memory_space, + typename t_host::device_type>; static_assert( device_is_t_dev_device || device_is_t_host_device || @@ -636,9 +626,9 @@ class DualView : public ViewTraits<DataType, Properties...> { template <class Device> void sync(const std::enable_if_t< - (std::is_same<typename traits::data_type, - typename 
traits::non_const_data_type>::value) || - (std::is_same<Device, int>::value), + (std::is_same_v<typename traits::data_type, + typename traits::non_const_data_type>) || + (std::is_same_v<Device, int>), int>& = 0) { sync_impl<Device>(std::true_type{}); } @@ -646,9 +636,9 @@ class DualView : public ViewTraits<DataType, Properties...> { template <class Device, class ExecutionSpace> void sync(const ExecutionSpace& exec, const std::enable_if_t< - (std::is_same<typename traits::data_type, - typename traits::non_const_data_type>::value) || - (std::is_same<Device, int>::value), + (std::is_same_v<typename traits::data_type, + typename traits::non_const_data_type>) || + (std::is_same_v<Device, int>), int>& = 0) { sync_impl<Device>(std::true_type{}, exec); } @@ -678,18 +668,18 @@ class DualView : public ViewTraits<DataType, Properties...> { template <class Device> void sync(const std::enable_if_t< - (!std::is_same<typename traits::data_type, - typename traits::non_const_data_type>::value) || - (std::is_same<Device, int>::value), + (!std::is_same_v<typename traits::data_type, + typename traits::non_const_data_type>) || + (std::is_same_v<Device, int>), int>& = 0) { sync_impl<Device>(std::false_type{}); } template <class Device, class ExecutionSpace> void sync(const ExecutionSpace& exec, const std::enable_if_t< - (!std::is_same<typename traits::data_type, - typename traits::non_const_data_type>::value) || - (std::is_same<Device, int>::value), + (!std::is_same_v<typename traits::data_type, + typename traits::non_const_data_type>) || + (std::is_same_v<Device, int>), int>& = 0) { sync_impl<Device>(std::false_type{}, exec); } @@ -952,14 +942,23 @@ class DualView : public ViewTraits<DataType, Properties...> { Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); if (sizeMismatch) { - ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { - h_view = create_mirror_view(typename t_host::memory_space(), d_view); + if constexpr 
(alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + ::Kokkos::realloc(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); } else { - h_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_host::memory_space(), d_view); + ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); + if constexpr (alloc_prop_input::initialize) { + h_view = create_mirror_view(typename t_host::memory_space(), d_view); + } else { + h_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_host::memory_space(), d_view); + } } - } else if (alloc_prop_input::initialize) { + } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop); @@ -1047,12 +1046,10 @@ class DualView : public ViewTraits<DataType, Properties...> { /* Resize on Device */ if (sizeMismatch) { ::Kokkos::resize(properties, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { - h_view = create_mirror_view(typename t_host::memory_space(), d_view); - } else { - h_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_host::memory_space(), d_view); - } + // this part of the lambda was relocated in a method as it contains a + // `if constexpr`. 
In some cases, both branches were evaluated + // leading to a compile error + resync_host(properties); /* Mark Device copy as modified */ ++modified_flags(1); @@ -1063,22 +1060,32 @@ class DualView : public ViewTraits<DataType, Properties...> { /* Resize on Host */ if (sizeMismatch) { ::Kokkos::resize(properties, h_view, n0, n1, n2, n3, n4, n5, n6, n7); - if (alloc_prop_input::initialize) { - d_view = create_mirror_view(typename t_dev::memory_space(), h_view); - - } else { - d_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_dev::memory_space(), h_view); - } + // this part of the lambda was relocated in a method as it contains a + // `if constexpr`. In some cases, both branches were evaluated + // leading to a compile error + resync_device(properties); /* Mark Host copy as modified */ ++modified_flags(0); } }; - constexpr bool has_execution_space = alloc_prop_input::has_execution_space; + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!alloc_prop_input::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); - if constexpr (has_execution_space) { + if (sizeMismatch) { + sync<typename t_host::memory_space>(); + ::Kokkos::resize(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); + } + return; + } else if constexpr (alloc_prop_input::has_execution_space) { using ExecSpace = typename alloc_prop_input::execution_space; const auto& exec_space = Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop); @@ -1108,6 +1115,39 @@ class DualView : public ViewTraits<DataType, Properties...> { } } + private: + // resync host mirror from device + // this code was relocated from a lambda as it contains a `if constexpr`. 
+ // In some cases, both branches were evaluated, leading to a compile error + template <class... ViewCtorArgs> + inline void resync_host(Impl::ViewCtorProp<ViewCtorArgs...> const&) { + using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; + + if constexpr (alloc_prop_input::initialize) { + h_view = create_mirror_view(typename t_host::memory_space(), d_view); + } else { + h_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_host::memory_space(), d_view); + } + } + + // resync device mirror from host + // this code was relocated from a lambda as it contains a `if constexpr` + // In some cases, both branches were evaluated leading to a compile error + template <class... ViewCtorArgs> + inline void resync_device(Impl::ViewCtorProp<ViewCtorArgs...> const&) { + using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; + + if constexpr (alloc_prop_input::initialize) { + d_view = create_mirror_view(typename t_dev::memory_space(), h_view); + + } else { + d_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_dev::memory_space(), h_view); + } + } + + public: void resize(const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -1163,15 +1203,15 @@ class DualView : public ViewTraits<DataType, Properties...> { } template <typename iType> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral<iType>::value, size_t> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<std::is_integral_v<iType>, + size_t> extent(const iType& r) const { return d_view.extent(r); } template <typename iType> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral<iType>::value, int> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<std::is_integral_v<iType>, + int> extent_int(const iType& r) const { return static_cast<int>(d_view.extent(r)); } diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp 
b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp index 52aa86d8ee4351f14f711ab8da11fe42902b175b..b8603595264c213c767706b188ce0515472a2a7b 100644 --- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -38,6 +38,23 @@ class DynRankView; // forward declare namespace Impl { +template <class T, size_t Rank> +struct ViewDataTypeFromRank { + using type = typename ViewDataTypeFromRank<T, Rank - 1>::type*; +}; + +template <class T> +struct ViewDataTypeFromRank<T, 0> { + using type = T; +}; + +template <unsigned N, typename T, typename... Args> +KOKKOS_FUNCTION View<typename ViewDataTypeFromRank<T, N>::type, Args...> +as_view_of_rank_n( + DynRankView<T, Args...> v, + std::enable_if_t<std::is_same_v<typename ViewTraits<T, Args...>::specialize, + void>>* = nullptr); + template <typename Specialize> struct DynRankDimTraits { enum : size_t { unspecified = KOKKOS_INVALID_INDEX }; @@ -91,54 +108,59 @@ struct DynRankDimTraits { } // Create the layout for the rank-7 view. + // Because the underlying View is rank-7, preserve "unspecified" for + // dimension 8. + // Non-strided Layout template <typename Layout> KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same<Layout, Kokkos::LayoutRight>::value || - std::is_same<Layout, Kokkos::LayoutLeft>::value), + (std::is_same_v<Layout, Kokkos::LayoutRight> || + std::is_same_v<Layout, Kokkos::LayoutLeft>), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.dimension[7] != unspecified ? 
layout.dimension[7] : 1); + Layout new_layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified); + new_layout.stride = layout.stride; + return new_layout; } // LayoutStride template <typename Layout> KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same<Layout, Kokkos::LayoutStride>::value), Layout> + (std::is_same_v<Layout, Kokkos::LayoutStride>), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.stride[0], - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.stride[1], - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.stride[2], - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.stride[3], - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.stride[4], - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.stride[5], - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.stride[6], - layout.dimension[7] != unspecified ? layout.dimension[7] : 1, - layout.stride[7]); + return Layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.stride[0], + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.stride[1], + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.stride[2], + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.stride[3], + layout.dimension[4] != unspecified ? 
layout.dimension[4] : 1, + layout.stride[4], + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.stride[5], + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.stride[6], + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified, + layout.stride[7]); } // Extra overload to match that for specialize types template <typename Traits, typename... P> KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same<typename Traits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename Traits::array_layout, Kokkos::LayoutLeft>::value || - std::is_same<typename Traits::array_layout, - Kokkos::LayoutStride>::value), + (std::is_same_v<typename Traits::array_layout, Kokkos::LayoutRight> || + std::is_same_v<typename Traits::array_layout, Kokkos::LayoutLeft> || + std::is_same_v<typename Traits::array_layout, Kokkos::LayoutStride>), typename Traits::array_layout> createLayout(const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, const typename Traits::array_layout& layout) { @@ -164,9 +186,8 @@ struct DynRankDimTraits { // Non-strided Layout template <typename Layout, typename iType> KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same<Layout, Kokkos::LayoutRight>::value || - std::is_same<Layout, Kokkos::LayoutLeft>::value) && - std::is_integral<iType>::value, + (std::is_same_v<Layout, Kokkos::LayoutRight> || + std::is_same_v<Layout, Kokkos::LayoutLeft>)&&std::is_integral_v<iType>, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? 
layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -182,8 +203,7 @@ reconstructLayout(const Layout& layout, iType dynrank) { // LayoutStride template <typename Layout, typename iType> KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same<Layout, Kokkos::LayoutStride>::value) && - std::is_integral<iType>::value, + (std::is_same_v<Layout, Kokkos::LayoutStride>)&&std::is_integral_v<iType>, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -284,40 +304,43 @@ namespace Impl { template <class DstTraits, class SrcTraits> class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<(std::is_same<typename DstTraits::memory_space, - typename SrcTraits::memory_space>::value && - std::is_void<typename DstTraits::specialize>::value && - std::is_void<typename SrcTraits::specialize>::value && - (std::is_same<typename DstTraits::array_layout, - typename SrcTraits::array_layout>::value || - ((std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutStride>::value) && - (std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutStride>::value)))), - Kokkos::Impl::ViewToDynRankViewTag>> { + std::enable_if_t< + (std::is_same_v<typename DstTraits::memory_space, + typename SrcTraits::memory_space> && + std::is_void_v<typename DstTraits::specialize> && + std::is_void_v<typename SrcTraits::specialize> && + (std::is_same_v<typename DstTraits::array_layout, + typename SrcTraits::array_layout> || + ((std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + 
typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v<typename SrcTraits:: + array_layout, + Kokkos::LayoutLeft> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>)))), + Kokkos::Impl::ViewToDynRankViewTag>> { private: enum { is_assignable_value_type = - std::is_same<typename DstTraits::value_type, - typename SrcTraits::value_type>::value || - std::is_same<typename DstTraits::value_type, - typename SrcTraits::const_value_type>::value + std::is_same_v<typename DstTraits::value_type, + typename SrcTraits::value_type> || + std::is_same_v<typename DstTraits::value_type, + typename SrcTraits::const_value_type> }; enum { is_assignable_layout = - std::is_same<typename DstTraits::array_layout, - typename SrcTraits::array_layout>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutStride>::value + std::is_same_v<typename DstTraits::array_layout, + typename SrcTraits::array_layout> || + std::is_same_v<typename DstTraits::array_layout, Kokkos::LayoutStride> }; public: @@ -345,7 +368,7 @@ class ViewMapping< src.layout()); // Check this for integer input1 for padding, etc dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle<DstTraits>::assign( src.m_map.m_impl_handle, src.m_track.m_tracker); - dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); + dst.m_track.m_tracker.assign(src.m_track.m_tracker, DstTraits::is_managed); dst.m_rank = Kokkos::View<ST, SP...>::rank(); } }; @@ -378,10 +401,11 @@ struct is_dyn_rank_view<Kokkos::DynRankView<D, P...>> : public std::true_type { template <class T> inline constexpr bool is_dyn_rank_view_v = is_dyn_rank_view<T>::value; +// Inherit privately from View, this way we don't import anything funky +// for example the rank member vs the rank() function of DynRankView template <typename DataType, class... 
Properties> -class DynRankView : public ViewTraits<DataType, Properties...> { - static_assert(!std::is_array<DataType>::value && - !std::is_pointer<DataType>::value, +class DynRankView : private View<DataType*******, Properties...> { + static_assert(!std::is_array_v<DataType> && !std::is_pointer_v<DataType>, "Cannot template DynRankView with array or pointer datatype - " "must be pod"); @@ -391,28 +415,66 @@ class DynRankView : public ViewTraits<DataType, Properties...> { template <class, class...> friend class Kokkos::Impl::ViewMapping; + size_t m_rank{}; + public: using drvtraits = ViewTraits<DataType, Properties...>; using view_type = View<DataType*******, Properties...>; - using traits = ViewTraits<DataType*******, Properties...>; - private: - using map_type = - Kokkos::Impl::ViewMapping<traits, typename traits::specialize>; - using track_type = Kokkos::Impl::SharedAllocationTracker; - - track_type m_track; - map_type m_map; - unsigned m_rank; + using drdtraits = Impl::DynRankDimTraits<typename view_type::specialize>; public: - KOKKOS_INLINE_FUNCTION + // typedefs from ViewTraits, overriden + using data_type = typename drvtraits::data_type; + using const_data_type = typename drvtraits::const_data_type; + using non_const_data_type = typename drvtraits::non_const_data_type; + + // typedefs from ViewTraits not overriden + using value_type = typename view_type::value_type; + using const_value_type = typename view_type::const_value_type; + using non_const_value_type = typename view_type::non_const_value_type; + using traits = typename view_type::traits; + using array_layout = typename view_type::array_layout; + + using execution_space = typename view_type::execution_space; + using memory_space = typename view_type::memory_space; + using device_type = typename view_type::device_type; + + using memory_traits = typename view_type::memory_traits; + using host_mirror_space = typename view_type::host_mirror_space; + using size_type = typename view_type::size_type; + + 
using reference_type = typename view_type::reference_type; + using pointer_type = typename view_type::pointer_type; + + using scalar_array_type = value_type; + using const_scalar_array_type = const_value_type; + using non_const_scalar_array_type = non_const_value_type; + using specialize = typename view_type::specialize; + + // typedefs in View for mdspan compatibility + // cause issues with MSVC+CUDA + // using layout_type = typename view_type::layout_type; + using index_type = typename view_type::index_type; + using element_type = typename view_type::element_type; + using rank_type = typename view_type::rank_type; + using reference = reference_type; + using data_handle_type = pointer_type; + + KOKKOS_FUNCTION view_type& DownCast() const { return (view_type&)(*this); } - KOKKOS_INLINE_FUNCTION + + // FIXME: this function make NO sense, the above one already is marked const + // Maybe one would want to get back a view of const?? + KOKKOS_FUNCTION const view_type& ConstDownCast() const { return (const view_type&)(*this); } + // FIXME: deprecate DownCast in favor of to_view + // KOKKOS_FUNCTION + // view_type to_view() const { return *this; } + // Types below - at least the HostMirror requires the value_type, NOT the rank // 7 data_type of the traits @@ -436,114 +498,36 @@ class DynRankView : public ViewTraits<DataType, Properties...> { typename drvtraits::array_layout, typename drvtraits::host_mirror_space>; + using host_mirror_type = HostMirror; //---------------------------------------- // Domain rank and extents // enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the // enum? 
- template <typename iType> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral<iType>::value, size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template <typename iType> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral<iType>::value, int> - extent_int(const iType& r) const { - return static_cast<int>(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() const; - //---------------------------------------- /* Deprecate all 'dimension' functions in favor of * ISO/C++ vocabulary 'extent'. */ - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) * - m_map.extent(3) * m_map.extent(4) * m_map.extent(5) * - m_map.extent(6) * m_map.extent(7); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference<reference_type>::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return (m_map.data() != nullptr); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>& - impl_map() const { - return m_map; - } - //---------------------------------------- private: enum { is_layout_left = - std::is_same<typename traits::array_layout, Kokkos::LayoutLeft>::value, + std::is_same_v<typename traits::array_layout, Kokkos::LayoutLeft>, is_layout_right = - std::is_same<typename traits::array_layout, Kokkos::LayoutRight>::value, + std::is_same_v<typename traits::array_layout, Kokkos::LayoutRight>, + + is_layout_stride = + std::is_same_v<typename traits::array_layout, Kokkos::LayoutStride>, - is_layout_stride = std::is_same<typename traits::array_layout, - Kokkos::LayoutStride>::value, + is_default_map = std::is_void_v<typename traits::specialize> && + (is_layout_left || is_layout_right || is_layout_stride), - is_default_map = std::is_void<typename traits::specialize>::value && - (is_layout_left || is_layout_right || is_layout_stride) + is_default_access = + is_default_map && std::is_same_v<reference_type, element_type&> }; // Bounds checking macros @@ -570,476 +554,272 @@ class DynRankView : public ViewTraits<DataType, Properties...> { #endif public: - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION constexpr unsigned rank() const { return m_rank; } - // 
operators () - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type operator()() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding...) - template <typename iType> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same<typename drvtraits::value_type, - typename drvtraits::scalar_array_type>::value && - std::is_integral<iType>::value, - reference_type> - operator[](const iType& i0) const { - // Phalanx is violating this, since they use the operator to access ALL - // elements in the allocation KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , - // this->rank(), m_track, m_map) ) - return data()[i0]; - } - - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding... AND a Trilinos/Sacado scalar type ) - template <typename iType> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - !std::is_same<typename drvtraits::value_type, - typename drvtraits::scalar_array_type>::value && - std::is_integral<iType>::value, - reference_type> - operator[](const iType& i0) const { - // auto map = impl_map(); - const size_t dim_scalar = m_map.dimension_scalar(); - const size_t bytes = this->span() / dim_scalar; - - using tmp_view_type = Kokkos::View< - DataType*, typename traits::array_layout, typename traits::device_type, - Kokkos::MemoryTraits<traits::memory_traits::is_unmanaged | - traits::memory_traits::is_random_access | - traits::memory_traits::is_atomic>>; - tmp_view_type rankone_view(this->data(), bytes, dim_scalar); - return rankone_view(i0); - } - - // Rank 1 parenthesis - template <typename iType> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void<typename traits::specialize>::value && - std::is_integral<iType>::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return 
m_map.reference(i0); - } - - template <typename iType> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename traits::specialize>::value && - std::is_integral<iType>::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template <typename iType0, typename iType1> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template <typename iType0, typename iType1> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template <typename iType0, typename iType1, typename iType2> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template <typename iType0, typename iType1, typename iType2> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - 
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template <typename iType0, typename iType1, typename iType2, typename iType3> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value && std::is_integral<iType3>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template <typename iType0, typename iType1, typename iType2, typename iType3> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value && std::is_integral<iType3>::value && - std::is_integral<iType4>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4> - KOKKOS_INLINE_FUNCTION - 
std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4, typename iType5> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value && std::is_integral<iType3>::value && - std::is_integral<iType4>::value && std::is_integral<iType5>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4, typename iType5> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); - } - - // Rank 7 - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4, typename iType5, typename iType6> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value && 
std::is_integral<iType3>::value && - std::is_integral<iType4>::value && std::is_integral<iType5>::value && - std::is_integral<iType6>::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5, - const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type access() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // Rank 1 parenthesis - template <typename iType> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void<typename traits::specialize>::value && - std::is_integral<iType>::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template <typename iType> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename traits::specialize>::value && - std::is_integral<iType>::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template <typename iType0, typename iType1> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); + using view_type::data; + using view_type::extent; + using view_type::extent_int; // FIXME: not tested + using view_type::impl_map; // FIXME: not tested + using view_type::is_allocated; + 
using view_type::label; + using view_type::size; + using view_type::span; + using view_type::span_is_contiguous; // FIXME: not tested + using view_type::stride; // FIXME: not tested + using view_type::stride_0; // FIXME: not tested + using view_type::stride_1; // FIXME: not tested + using view_type::stride_2; // FIXME: not tested + using view_type::stride_3; // FIXME: not tested + using view_type::stride_4; // FIXME: not tested + using view_type::stride_5; // FIXME: not tested + using view_type::stride_6; // FIXME: not tested + using view_type::stride_7; // FIXME: not tested + using view_type::use_count; + +#ifdef KOKKOS_ENABLE_CUDA + KOKKOS_FUNCTION reference_type + operator()(index_type i0 = 0, index_type i1 = 0, index_type i2 = 0, + index_type i3 = 0, index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } - - template <typename iType0, typename iType1> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template <typename iType0, typename iType1, typename iType2> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template <typename iType0, typename iType1, typename iType2> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> 
- access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); +#else + // Adding shortcut operators for rank-0 to rank-3 for default layouts + // and access modalities. + // This removes performance overhead for always using rank-7 mapping. + // See https://github.com/kokkos/kokkos/issues/7604 + // When boundschecking is enabled we still go through the underlying + // rank-7 View to leverage the error checks there. + + KOKKOS_FUNCTION reference_type operator()() const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 0u) + Kokkos::abort( + "DynRankView rank 0 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + return view_type::data()[0]; + } else +#endif + return view_type::operator()(0, 0, 0, 0, 0, 0, 0); } - // Rank 4 - template <typename iType0, typename iType1, typename iType2, typename iType3> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value && std::is_integral<iType3>::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); + KOKKOS_FUNCTION reference_type operator()(index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) 
allows mismatch of rank and + // index args + if (rank() > 1u) + Kokkos::abort( + "DynRankView rank 1 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_stride) { + return view_type::data()[i0 * view_type::stride(0)]; + } else { + return view_type::data()[i0]; + } + } else +#endif + return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } - template <typename iType0, typename iType1, typename iType2, typename iType3> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); + KOKKOS_FUNCTION reference_type operator()(index_type i0, + index_type i1) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) 
allows mismatch of rank and + // index args + if (rank() > 2u) + Kokkos::abort( + "DynRankView rank 2 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_left) { + return view_type::data()[i0 + i1 * view_type::stride(1)]; + } else if constexpr (is_layout_right) { + return view_type::data()[i0 * view_type::extent(1) + i1]; + } else { + return view_type::data()[i0 * view_type::stride(0) + + i1 * view_type::stride(1)]; + } + } else +#endif + return view_type::operator()(i0, i1, 0, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } - // Rank 5 - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value && std::is_integral<iType3>::value && - std::is_integral<iType4>::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); + KOKKOS_FUNCTION reference_type operator()(index_type i0, index_type i1, + index_type i2) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) 
allows mismatch of rank and + // index args + if (rank() > 3u) + Kokkos::abort( + "DynRankView rank 3 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_left) { + return view_type::data()[i0 + view_type::stride(1) * + (i1 + i2 * view_type::extent(1))]; + } else if constexpr (is_layout_right) { + return view_type::data()[(i0 * view_type::extent(1) + i1) * + view_type::extent(2) + + i2]; + } else { + return view_type::data()[i0 * view_type::stride(0) + + i1 * view_type::stride(1) + + i2 * view_type::stride(2)]; + } + } else +#endif + return view_type::operator()(i0, i1, i2, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); + KOKKOS_FUNCTION reference_type operator()(index_type i0, index_type i1, + index_type i2, index_type i3, + index_type i4 = 0, + index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } +#endif - // Rank 6 - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4, typename iType5> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void<typename traits::specialize>::value && - std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value && 
std::is_integral<iType3>::value && - std::is_integral<iType4>::value && std::is_integral<iType5>::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); +// This is an accomodation for Phalanx, that is usint the operator[] to access +// all elements in a linear fashion even when the rank is not 1 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { + if constexpr (std::is_same_v<typename drvtraits::value_type, + typename drvtraits::scalar_array_type>) { + return view_type::data()[i0]; + } else { + const size_t dim_scalar = view_type::impl_map().dimension_scalar(); + const size_t bytes = view_type::span() / dim_scalar; + + using tmp_view_type = + Kokkos::View<DataType*, typename traits::array_layout, + typename traits::device_type, + Kokkos::MemoryTraits<traits::memory_traits::impl_value | + unsigned(Kokkos::Unmanaged)>>; + tmp_view_type rankone_view(view_type::data(), bytes, dim_scalar); + return rankone_view(i0); + } } - - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4, typename iType5> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<!(std::is_void<typename drvtraits::specialize>::value && - std::is_integral<iType0>::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); +#else + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 1u) + Kokkos::abort("DynRankView operator[] can only be used for rank-1"); +#endif + return 
view_type::operator()(i0, 0, 0, 0, 0, 0, 0); } +#endif - // Rank 7 - template <typename iType0, typename iType1, typename iType2, typename iType3, - typename iType4, typename iType5, typename iType6> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral<iType0>::value && std::is_integral<iType1>::value && - std::is_integral<iType2>::value && std::is_integral<iType3>::value && - std::is_integral<iType4>::value && std::is_integral<iType5>::value && - std::is_integral<iType6>::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5, const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + KOKKOS_FUNCTION reference_type access(index_type i0 = 0, index_type i1 = 0, + index_type i2 = 0, index_type i3 = 0, + index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - //---------------------------------------- // Standard constructor, destructor, and assignment operators... KOKKOS_DEFAULTED_FUNCTION ~DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView() : m_track(), m_map(), m_rank() {} // Default ctor - - KOKKOS_INLINE_FUNCTION - DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} + KOKKOS_DEFAULTED_FUNCTION DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView(DynRankView&& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + // Make this conditionally explicit? + template <class RT, class... 
RP> + KOKKOS_FUNCTION DynRankView(const DynRankView<RT, RP...>& rhs) + : view_type(rhs), m_rank(rhs.m_rank) {} - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(const DynRankView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; + template <class RT, class... RP> + KOKKOS_FUNCTION DynRankView& operator=(const DynRankView<RT, RP...>& rhs) { + view_type::operator=(rhs); + m_rank = rhs.m_rank; return *this; } - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(DynRankView&& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; - return *this; +#if 0 // TODO: this will later be swapped in depending on whether the new View + // impl is active + private: + template <class Ext> + KOKKOS_FUNCTION typename view_type::extents_type create_rank7_extents( + const Ext& ext) { + return typename view_type::extents_type( + ext.rank() > 0 ? ext.extent(0) : 1, ext.rank() > 1 ? ext.extent(1) : 1, + ext.rank() > 2 ? ext.extent(2) : 1, ext.rank() > 3 ? ext.extent(3) : 1, + ext.rank() > 4 ? ext.extent(4) : 1, ext.rank() > 5 ? ext.extent(5) : 1, + ext.rank() > 6 ? ext.extent(6) : 1); } - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. + public: + // Copy/Assign View to DynRankView template <class RT, class... 
RP> - KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView<RT, RP...>& rhs) - : m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) { - using SrcTraits = typename DynRankView<RT, RP...>::traits; - using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, - typename traits::specialize>; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); + KOKKOS_INLINE_FUNCTION DynRankView(const View<RT, RP...>& rhs, + size_t new_rank) + : view_type(rhs.data_handle(), drdtraits::createLayout(rhs.layout())), + m_rank(new_rank) { + if (new_rank > rhs.rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); } template <class RT, class... RP> - KOKKOS_INLINE_FUNCTION DynRankView& operator=( - const DynRankView<RT, RP...>& rhs) { - using SrcTraits = typename DynRankView<RT, RP...>::traits; - using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, - typename traits::specialize>; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); - m_track.assign(rhs.m_track, traits::is_managed); + KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View<RT, RP...>& rhs) { + view_type::operator=(view_type( + rhs.data_handle(), + typename view_type::mapping_type(create_rank7_extents(rhs.extents())), + rhs.accessor())); m_rank = rhs.rank(); return *this; } - - // Copy/Assign View to DynRankView +#else template <class RT, class... 
RP> - KOKKOS_INLINE_FUNCTION DynRankView(const View<RT, RP...>& rhs) - : m_track(), m_map(), m_rank(View<RT, RP...>::rank()) { + KOKKOS_FUNCTION DynRankView(const View<RT, RP...>& rhs, size_t new_rank) { using SrcTraits = typename View<RT, RP...>::traits; using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, Kokkos::Impl::ViewToDynRankViewTag>; static_assert(Mapping::is_assignable, - "Incompatible View to DynRankView copy construction"); + "Incompatible View to DynRankView copy assignment"); + if (new_rank > View<RT, RP...>::rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); Mapping::assign(*this, rhs); + m_rank = new_rank; } template <class RT, class... RP> - KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View<RT, RP...>& rhs) { + KOKKOS_FUNCTION DynRankView& operator=(const View<RT, RP...>& rhs) { using SrcTraits = typename View<RT, RP...>::traits; using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, @@ -1047,114 +827,80 @@ class DynRankView : public ViewTraits<DataType, Properties...> { static_assert(Mapping::is_assignable, "Incompatible View to DynRankView copy assignment"); Mapping::assign(*this, rhs); + m_rank = View<RT, RP...>::rank(); return *this; } +#endif + + template <class RT, class... 
RP> + KOKKOS_FUNCTION DynRankView(const View<RT, RP...>& rhs) + : DynRankView(rhs, View<RT, RP...>::rank()) {} //---------------------------------------- // Allocation tracking properties - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.use_count(); } - - inline const std::string label() const { - return m_track.template get_label<typename traits::memory_space>(); - } - //---------------------------------------- // Allocation according to allocation properties and array layout // unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that // rank deduction can properly take place + // We need two variants to avoid calling host function from host device + // function warnings template <class... P> - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp<P...>& arg_prop, - std::enable_if_t<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), - m_map(), - m_rank(Impl::DynRankDimTraits<typename traits::specialize>:: - template computeRank<typename traits::array_layout, P...>( - arg_prop, arg_layout)) { - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing DynRankView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, - Impl::DynRankDimTraits<typename traits::specialize>:: - template createLayout<traits, P...>(arg_prop, arg_layout), - Impl::ViewCtorProp<P...>::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - } + std::enable_if_t<Kokkos::Impl::ViewCtorProp<P...>::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout<traits, P...>( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} - // Wrappers template <class... P> - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp<P...>& arg_prop, - std::enable_if_t<Kokkos::Impl::ViewCtorProp<P...>::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, - Impl::DynRankDimTraits<typename traits::specialize>:: - template createLayout<traits, P...>(arg_prop, arg_layout)), - m_rank(Impl::DynRankDimTraits<typename traits::specialize>:: - template computeRank<typename traits::array_layout, P...>( - arg_prop, arg_layout)) { - static_assert( - std::is_same<pointer_type, - typename Impl::ViewCtorProp<P...>::pointer_type>::value, - "Constructing DynRankView to wrap user memory must supply matching " - "pointer type"); - } + std::enable_if_t<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout<traits, P...>( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} //---------------------------------------- // Constructor(s) // Simple dimension-only layout + // We need two variants to avoid 
calling host function from host device + // function warnings template <class... P> - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp<P...>& arg_prop, - std::enable_if_t<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t<Kokkos::Impl::ViewCtorProp<P...>::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} template <class... 
P> - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp<P...>& arg_prop, - std::enable_if_t<Kokkos::Impl::ViewCtorProp<P...>::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} @@ -1188,16 +934,20 @@ class DynRankView : public ViewTraits<DataType, Properties...> { //---------------------------------------- // Memory span required to wrap these dimensions. 
+ // FIXME: this function needs to be tested static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, + [[maybe_unused]] const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + // FIXME: check that arg_N7 is not set by user (in debug mode) + return view_type::required_allocation_size(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_INVALID_INDEX, + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, const size_t arg_N2 = KOKKOS_INVALID_INDEX, const size_t arg_N3 = KOKKOS_INVALID_INDEX, @@ -1205,55 +955,38 @@ class DynRankView : public ViewTraits<DataType, Properties...> { const size_t arg_N5 = KOKKOS_INVALID_INDEX, const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView(Kokkos::Impl::ViewCtorProp<pointer_type>(arg_ptr), arg_N0, - arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} + : DynRankView( + Kokkos::Impl::ViewCtorProp<typename view_type::pointer_type>( + arg_ptr), + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, typename traits::array_layout& arg_layout) - : DynRankView(Kokkos::Impl::ViewCtorProp<pointer_type>(arg_ptr), - arg_layout) {} + explicit KOKKOS_FUNCTION DynRankView( + typename 
view_type::pointer_type arg_ptr, + typename traits::array_layout& arg_layout) + : DynRankView( + Kokkos::Impl::ViewCtorProp<typename view_type::pointer_type>( + arg_ptr), + arg_layout) {} //---------------------------------------- // Shared scratch memory constructor - static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - const size_t num_passed_args = - (arg_N0 != KOKKOS_INVALID_INDEX) + (arg_N1 != KOKKOS_INVALID_INDEX) + - (arg_N2 != KOKKOS_INVALID_INDEX) + (arg_N3 != KOKKOS_INVALID_INDEX) + - (arg_N4 != KOKKOS_INVALID_INDEX) + (arg_N5 != KOKKOS_INVALID_INDEX) + - (arg_N6 != KOKKOS_INVALID_INDEX) + (arg_N7 != KOKKOS_INVALID_INDEX); - - if (std::is_void<typename traits::specialize>::value && - num_passed_args != traits::rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - {} - - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + // Note: We must pass 7 valid args since view_type is rank 7 + static inline size_t shmem_size( + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + return view_type::shmem_size(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, + arg_N6, arg_N7); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const typename traits::array_layout& arg_layout) - : DynRankView( - 
Kokkos::Impl::ViewCtorProp<pointer_type>( - reinterpret_cast<pointer_type>( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits<typename traits::specialize>:: - createLayout(arg_layout) // is this correct? - )))), - arg_layout) {} + : view_type(arg_space, drdtraits::createLayout(arg_layout)), + m_rank(drdtraits::computeRank(arg_layout)) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, @@ -1264,21 +997,38 @@ class DynRankView : public ViewTraits<DataType, Properties...> { const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView( - Kokkos::Impl::ViewCtorProp<pointer_type>( - reinterpret_cast<pointer_type>( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits<typename traits::specialize>:: - createLayout(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, - arg_N6, arg_N7)))))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) {} + : DynRankView(arg_space, typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, + arg_N5, arg_N6, arg_N7)) {} + + KOKKOS_FUNCTION constexpr auto layout() const { + switch (rank()) { + case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); + case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); + case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); + case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); + case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); + case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); + case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); + case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); + default: + KOKKOS_IF_ON_HOST( + Kokkos::abort( + std::string( + "Calling DynRankView::layout on DRV of 
unexpected rank " + + std::to_string(rank())) + .c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "Calling DynRankView::layout on DRV of unexpected rank");) + } + // control flow should never reach here + return view_type::layout(); + } }; template <typename D, class... P> -KOKKOS_INLINE_FUNCTION constexpr unsigned rank( - const DynRankView<D, P...>& DRV) { +KOKKOS_FUNCTION constexpr unsigned rank(const DynRankView<D, P...>& DRV) { return DRV.rank(); } // needed for transition to common constexpr method in view and dynrankview // to return rank @@ -1293,181 +1043,46 @@ struct DynRankSubviewTag {}; } // namespace Impl -namespace Impl { - -template <class SrcTraits, class... Args> -class ViewMapping< - std::enable_if_t<(std::is_void<typename SrcTraits::specialize>::value && - (std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutStride>::value)), - Kokkos::Impl::DynRankSubviewTag>, - SrcTraits, Args...> { - private: - enum { - RZ = false, - R0 = bool(is_integral_extent<0, Args...>::value), - R1 = bool(is_integral_extent<1, Args...>::value), - R2 = bool(is_integral_extent<2, Args...>::value), - R3 = bool(is_integral_extent<3, Args...>::value), - R4 = bool(is_integral_extent<4, Args...>::value), - R5 = bool(is_integral_extent<5, Args...>::value), - R6 = bool(is_integral_extent<6, Args...>::value) - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) - }; - - using array_layout = Kokkos::LayoutStride; - - using value_type = typename SrcTraits::value_type; - - using data_type = value_type*******; - - public: - using traits_type = Kokkos::ViewTraits<data_type, array_layout, - typename SrcTraits::device_type, - typename SrcTraits::memory_traits>; - - using type = - Kokkos::View<data_type, array_layout, typename 
SrcTraits::device_type, - typename SrcTraits::memory_traits>; - - template <class MemoryTraits> - struct apply { - static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, ""); - - using traits_type = - Kokkos::ViewTraits<data_type, array_layout, - typename SrcTraits::device_type, MemoryTraits>; - - using type = Kokkos::View<data_type, array_layout, - typename SrcTraits::device_type, MemoryTraits>; - }; - - using dimension = typename SrcTraits::dimension; - - template <class Arg0 = int, class Arg1 = int, class Arg2 = int, - class Arg3 = int, class Arg4 = int, class Arg5 = int, - class Arg6 = int> - struct ExtentGenerator { - KOKKOS_INLINE_FUNCTION - static SubviewExtents<7, rank> generator( - const dimension& dim, Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), - Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), - Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6()) { - return SubviewExtents<7, rank>(dim, arg0, arg1, arg2, arg3, arg4, arg5, - arg6); - } - }; - - using ret_type = Kokkos::DynRankView<value_type, array_layout, - typename SrcTraits::device_type, - typename SrcTraits::memory_traits>; - - template <typename T, class... P> - KOKKOS_INLINE_FUNCTION static ret_type subview( - const unsigned src_rank, Kokkos::DynRankView<T, P...> const& src, - Args... 
args) { - using DstType = ViewMapping<traits_type, typename traits_type::specialize>; - - using DstDimType = std::conditional_t< - (rank == 0), ViewDimension<>, - std::conditional_t< - (rank == 1), ViewDimension<0>, - std::conditional_t< - (rank == 2), ViewDimension<0, 0>, - std::conditional_t< - (rank == 3), ViewDimension<0, 0, 0>, - std::conditional_t< - (rank == 4), ViewDimension<0, 0, 0, 0>, - std::conditional_t< - (rank == 5), ViewDimension<0, 0, 0, 0, 0>, - std::conditional_t< - (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>, - ViewDimension<0, 0, 0, 0, 0, 0, 0>>>>>>>>; - - using dst_offset_type = ViewOffset<DstDimType, Kokkos::LayoutStride>; - using dst_handle_type = typename DstType::handle_type; - - ret_type dst; - - const SubviewExtents<7, rank> extents = ExtentGenerator<Args...>::generator( - src.m_map.m_impl_offset.m_dim, args...); - - dst_offset_type tempdst(src.m_map.m_impl_offset, extents); - - dst.m_track = src.m_track; - - dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0; - dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1; - dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2; - dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3; - dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4; - dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5; - dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6; - - dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0; - dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1; - dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2; - dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3; - dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4; - dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5; - dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6; - - dst.m_map.m_impl_handle = - dst_handle_type(src.m_map.m_impl_handle + - src.m_map.m_impl_offset( - extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - 
extents.domain_offset(4), extents.domain_offset(5), - extents.domain_offset(6))); - - dst.m_rank = - (src_rank > 0 ? unsigned(R0) : 0) + (src_rank > 1 ? unsigned(R1) : 0) + - (src_rank > 2 ? unsigned(R2) : 0) + (src_rank > 3 ? unsigned(R3) : 0) + - (src_rank > 4 ? unsigned(R4) : 0) + (src_rank > 5 ? unsigned(R5) : 0) + - (src_rank > 6 ? unsigned(R6) : 0); - - return dst; - } -}; - -} // namespace Impl - template <class V, class... Args> using Subdynrankview = typename Kokkos::Impl::ViewMapping<Kokkos::Impl::DynRankSubviewTag, V, Args...>::ret_type; -template <class D, class... P, class... Args> -KOKKOS_INLINE_FUNCTION Subdynrankview<ViewTraits<D*******, P...>, Args...> -subdynrankview(const Kokkos::DynRankView<D, P...>& src, Args... args) { - if (src.rank() > sizeof...(Args)) // allow sizeof...(Args) >= src.rank(), - // ignore the remaining args - { - Kokkos::abort( - "subdynrankview: num of args must be >= rank of the source " - "DynRankView"); - } - - using metafcn = - Kokkos::Impl::ViewMapping<Kokkos::Impl::DynRankSubviewTag, - Kokkos::ViewTraits<D*******, P...>, Args...>; - - return metafcn::subview(src.rank(), src, args...); +template <class... DRVArgs, class SubArg0 = int, class SubArg1 = int, + class SubArg2 = int, class SubArg3 = int, class SubArg4 = int, + class SubArg5 = int, class SubArg6 = int> +KOKKOS_INLINE_FUNCTION auto subdynrankview( + const DynRankView<DRVArgs...>& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + auto sub = subview(drv.DownCast(), arg0, arg1, arg2, arg3, arg4, arg5, arg6); + using sub_t = decltype(sub); + size_t new_rank = (drv.rank() > 0 && !std::is_integral_v<SubArg0> ? 1 : 0) + + (drv.rank() > 1 && !std::is_integral_v<SubArg1> ? 1 : 0) + + (drv.rank() > 2 && !std::is_integral_v<SubArg2> ? 1 : 0) + + (drv.rank() > 3 && !std::is_integral_v<SubArg3> ? 
1 : 0) + + (drv.rank() > 4 && !std::is_integral_v<SubArg4> ? 1 : 0) + + (drv.rank() > 5 && !std::is_integral_v<SubArg5> ? 1 : 0) + + (drv.rank() > 6 && !std::is_integral_v<SubArg6> ? 1 : 0); + + using return_type = + DynRankView<typename sub_t::value_type, Kokkos::LayoutStride, + typename sub_t::device_type, typename sub_t::memory_traits>; + return static_cast<return_type>( + DynRankView<typename sub_t::value_type, typename sub_t::array_layout, + typename sub_t::device_type, typename sub_t::memory_traits>( + sub, new_rank)); } - -// Wrapper to allow subview function name -template <class D, class... P, class... Args> -KOKKOS_INLINE_FUNCTION Subdynrankview<ViewTraits<D*******, P...>, Args...> -subview(const Kokkos::DynRankView<D, P...>& src, Args... args) { - return subdynrankview(src, args...); +template <class... DRVArgs, class SubArg0 = int, class SubArg1 = int, + class SubArg2 = int, class SubArg3 = int, class SubArg4 = int, + class SubArg5 = int, class SubArg6 = int> +KOKKOS_INLINE_FUNCTION auto subview( + const DynRankView<DRVArgs...>& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + return subdynrankview(drv, arg0, arg1, arg2, arg3, arg4, arg5, arg6); } } // namespace Kokkos @@ -1482,12 +1097,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView<LT, LP...>& lhs, using lhs_traits = ViewTraits<LT, LP...>; using rhs_traits = ViewTraits<RT, RP...>; - return std::is_same<typename lhs_traits::const_value_type, - typename rhs_traits::const_value_type>::value && - std::is_same<typename lhs_traits::array_layout, - typename rhs_traits::array_layout>::value && - std::is_same<typename lhs_traits::memory_space, - typename rhs_traits::memory_space>::value && + return std::is_same_v<typename lhs_traits::const_value_type, + typename rhs_traits::const_value_type> && + std::is_same_v<typename lhs_traits::array_layout, + 
typename rhs_traits::array_layout> && + std::is_same_v<typename lhs_traits::memory_space, + typename rhs_traits::memory_space> && lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && @@ -1638,11 +1253,11 @@ namespace Impl { underlying memory, to facilitate implementation of deep_copy() and other routines that are defined on View */ template <unsigned N, typename T, typename... Args> -KOKKOS_FUNCTION auto as_view_of_rank_n( +KOKKOS_FUNCTION View<typename ViewDataTypeFromRank<T, N>::type, Args...> +as_view_of_rank_n( DynRankView<T, Args...> v, - typename std::enable_if<std::is_same< - typename ViewTraits<T, Args...>::specialize, void>::value>::type* = - nullptr) { + std::enable_if_t< + std::is_same_v<typename ViewTraits<T, Args...>::specialize, void>>*) { if (v.rank() != N) { KOKKOS_IF_ON_HOST( const std::string message = @@ -1653,8 +1268,16 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( Kokkos::abort("Converting DynRankView to a View of mis-matched rank!");) } - return View<typename RankDataType<T, N>::type, Args...>( - v.data(), v.impl_map().layout()); + auto layout = v.DownCast().layout(); + + if constexpr (std::is_same_v<decltype(layout), Kokkos::LayoutLeft> || + std::is_same_v<decltype(layout), Kokkos::LayoutRight> || + std::is_same_v<decltype(layout), Kokkos::LayoutStride>) { + for (int i = N; i < 7; ++i) + layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; + } + + return View<typename RankDataType<T, N>::type, Args...>(v.data(), layout); } template <typename Function, typename... Args> @@ -1683,43 +1306,16 @@ void apply_to_view_of_static_rank(Function&& f, DynRankView<Args...> a) { } // namespace Impl -template <typename D, class... 
P> -KOKKOS_INLINE_FUNCTION constexpr auto DynRankView<D, P...>::layout() const -> - typename traits::array_layout { - switch (rank()) { - case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); - case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); - case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); - case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); - case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); - case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); - case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); - case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); - default: - KOKKOS_IF_ON_HOST( - Kokkos::abort( - std::string( - "Calling DynRankView::layout on DRV of unexpected rank " + - std::to_string(rank())) - .c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "Calling DynRankView::layout on DRV of unexpected rank");) - } - // control flow should never reach here - return m_map.layout(); -} - /** \brief Deep copy a value from Host memory into a view. */ template <class ExecSpace, class DT, class... DP> inline void deep_copy( const ExecSpace& e, const DynRankView<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { static_assert( - std::is_same<typename ViewTraits<DT, DP...>::non_const_value_type, - typename ViewTraits<DT, DP...>::value_type>::value, + std::is_same_v<typename ViewTraits<DT, DP...>::non_const_value_type, + typename ViewTraits<DT, DP...>::value_type>, "deep_copy requires non-const type"); Impl::apply_to_view_of_static_rank( @@ -1730,8 +1326,8 @@ template <class DT, class... 
DP> inline void deep_copy( const DynRankView<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { Impl::apply_to_view_of_static_rank([=](auto view) { deep_copy(view, value); }, dst); } @@ -1742,8 +1338,8 @@ inline void deep_copy( const ExecSpace& e, typename ViewTraits<ST, SP...>::non_const_value_type& dst, const DynRankView<ST, SP...>& src, - std::enable_if_t<std::is_same<typename ViewTraits<ST, SP...>::specialize, - void>::value>* = 0) { + std::enable_if_t<std::is_same_v<typename ViewTraits<ST, SP...>::specialize, + void>>* = 0) { deep_copy(e, dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1751,8 +1347,8 @@ template <class ST, class... SP> inline void deep_copy( typename ViewTraits<ST, SP...>::non_const_value_type& dst, const DynRankView<ST, SP...>& src, - std::enable_if_t<std::is_same<typename ViewTraits<ST, SP...>::specialize, - void>::value>* = 0) { + std::enable_if_t<std::is_same_v<typename ViewTraits<ST, SP...>::specialize, + void>>* = 0) { deep_copy(dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1765,15 +1361,13 @@ inline void deep_copy( template <class ExecSpace, class DstType, class SrcType> inline void deep_copy( const ExecSpace& exec_space, const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void<typename DstType::traits::specialize>::value && - std::is_void<typename SrcType::traits::specialize>::value && - (Kokkos::is_dyn_rank_view<DstType>::value || - Kokkos::is_dyn_rank_view<SrcType>::value))>* = nullptr) { - static_assert( - std::is_same<typename DstType::traits::value_type, - typename DstType::traits::non_const_value_type>::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v<typename DstType::traits::specialize> && + std::is_void_v<typename SrcType::traits::specialize> 
&& + (Kokkos::is_dyn_rank_view<DstType>::value || + Kokkos::is_dyn_rank_view<SrcType>::value))>* = nullptr) { + static_assert(std::is_same_v<typename DstType::traits::value_type, + typename DstType::traits::non_const_value_type>, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1818,15 +1412,13 @@ inline void deep_copy( template <class DstType, class SrcType> inline void deep_copy( const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void<typename DstType::traits::specialize>::value && - std::is_void<typename SrcType::traits::specialize>::value && - (Kokkos::is_dyn_rank_view<DstType>::value || - Kokkos::is_dyn_rank_view<SrcType>::value))>* = nullptr) { - static_assert( - std::is_same<typename DstType::traits::value_type, - typename DstType::traits::non_const_value_type>::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v<typename DstType::traits::specialize> && + std::is_void_v<typename SrcType::traits::specialize> && + (Kokkos::is_dyn_rank_view<DstType>::value || + Kokkos::is_dyn_rank_view<SrcType>::value))>* = nullptr) { + static_assert(std::is_same_v<typename DstType::traits::value_type, + typename DstType::traits::non_const_value_type>, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1886,7 +1478,7 @@ struct MirrorDRViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same<memory_space, typename src_view_type::memory_space>::value + std::is_same_v<memory_space, typename src_view_type::memory_space> }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1901,277 +1493,158 @@ struct MirrorDRViewType { std::conditional_t<is_same_memspace, src_view_type, dest_view_type>; }; -template <class Space, class T, class... 
P> -struct MirrorDRVType { - // The incoming view_type - using src_view_type = typename Kokkos::DynRankView<T, P...>; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same<memory_space, typename src_view_type::memory_space>::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::DynRankView<data_type, array_layout, Space>; -}; - } // namespace Impl namespace Impl { -template <class T, class... P, class... ViewCtorArgs> -inline typename DynRankView<T, P...>::HostMirror create_mirror( - const DynRankView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - std::enable_if_t<!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* = - nullptr) { - using src_type = DynRankView<T, P...>; - using dst_type = typename src_type::HostMirror; - - using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); -} +// create a mirror +// private interface that accepts arbitrary view 
constructor args passed by a +// view_alloc template <class T, class... P, class... ViewCtorArgs> -inline auto create_mirror( - const DynRankView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - std::enable_if_t<Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* = - nullptr) { - using dst_type = typename Impl::MirrorDRVType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::view_type; - - using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); +inline auto create_mirror(const DynRankView<T, P...>& src, + const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { + check_view_ctor_args_create_mirror<ViewCtorArgs...>(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); + if constexpr (Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space) { + using dst_type = typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::dest_view_type; + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); + } else { + using src_type = DynRankView<T, P...>; + using dst_type = typename src_type::HostMirror; + + return dst_type(prop_copy, + Impl::reconstructLayout(src.layout(), src.rank())); + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif 
} } // namespace Impl -// Create a mirror in host space -template <class T, class... P> -inline typename DynRankView<T, P...>::HostMirror create_mirror( - const DynRankView<T, P...>& src, - std::enable_if_t<std::is_same<typename ViewTraits<T, P...>::specialize, - void>::value>* = nullptr) { - return Impl::create_mirror(src, Kokkos::Impl::ViewCtorProp<>{}); +// public interface +template <class T, class... P, + class Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +inline auto create_mirror(const DynRankView<T, P...>& src) { + return Impl::create_mirror(src, Kokkos::view_alloc()); } -template <class T, class... P> -inline typename DynRankView<T, P...>::HostMirror create_mirror( - Kokkos::Impl::WithoutInitializing_t wi, const DynRankView<T, P...>& src, - std::enable_if_t<std::is_same<typename ViewTraits<T, P...>::specialize, - void>::value>* = nullptr) { +// public interface that accepts a without initializing flag +template <class T, class... P, + class Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +inline auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, + const DynRankView<T, P...>& src) { return Impl::create_mirror(src, Kokkos::view_alloc(wi)); } -template <class T, class... P, class... ViewCtorArgs> -inline typename DynRankView<T, P...>::HostMirror create_mirror( - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - const DynRankView<T, P...>& src, - std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* = nullptr) { - return Impl::create_mirror(src, arg_prop); -} - -// Create a mirror in a new space +// public interface that accepts a space template <class Space, class T, class... 
P, - typename Enable = std::enable_if_t< + class Enable = std::enable_if_t< Kokkos::is_space<Space>::value && - std::is_void<typename ViewTraits<T, P...>::specialize>::value>> -typename Impl::MirrorDRVType<Space, T, P...>::view_type create_mirror( - const Space&, const Kokkos::DynRankView<T, P...>& src) { + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +inline auto create_mirror(const Space&, + const Kokkos::DynRankView<T, P...>& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } -template <class Space, class T, class... P> -typename Impl::MirrorDRVType<Space, T, P...>::view_type create_mirror( - Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::DynRankView<T, P...>& src, - std::enable_if_t<std::is_same<typename ViewTraits<T, P...>::specialize, - void>::value>* = nullptr) { +// public interface that accepts a space and a without initializing flag +template <class Space, class T, class... P, + class Enable = std::enable_if_t< + Kokkos::is_space<Space>::value && + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +inline auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::DynRankView<T, P...>& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } -template <class T, class... P, class... ViewCtorArgs> -inline auto create_mirror( - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - const DynRankView<T, P...>& src, - std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* = nullptr) { - using ReturnType = typename Impl::MirrorDRVType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::view_type; - return ReturnType{Impl::create_mirror(src, arg_prop)}; +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class T, class... P, class... 
ViewCtorArgs, + typename Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +inline auto create_mirror(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, + const DynRankView<T, P...>& src) { + return Impl::create_mirror(src, arg_prop); } namespace Impl { -template <class T, class... P, class... ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && - std::is_same< - typename DynRankView<T, P...>::memory_space, - typename DynRankView<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView<T, P...>::data_type, - typename DynRankView<T, P...>::HostMirror::data_type>::value, - typename DynRankView<T, P...>::HostMirror> -create_mirror_view(const DynRankView<T, P...>& src, - const typename Impl::ViewCtorProp<ViewCtorArgs...>&) { - return src; -} +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && - !(std::is_same< - typename DynRankView<T, P...>::memory_space, - typename DynRankView<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView<T, P...>::data_type, - typename DynRankView<T, P...>::HostMirror::data_type>::value), - typename DynRankView<T, P...>::HostMirror> -create_mirror_view( +inline auto create_mirror_view( const DynRankView<T, P...>& src, - const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -template <class T, class... P, class... 
ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::view_type> -create_mirror_view(const Kokkos::DynRankView<T, P...>& src, - const typename Impl::ViewCtorProp<ViewCtorArgs...>&) { - return src; + [[maybe_unused]] const typename Impl::ViewCtorProp<ViewCtorArgs...>& + arg_prop) { + if constexpr (!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space) { + if constexpr (std::is_same_v<typename DynRankView<T, P...>::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename DynRankView<T, P...>::data_type, + typename DynRankView<T, P...>::HostMirror::data_type>) { + return typename DynRankView<T, P...>::HostMirror(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorDRViewType<typename Impl::ViewCtorProp< + ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } -template <class T, class... P, class... 
ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -inline std::enable_if_t< - Kokkos::is_space< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::is_same_memspace, - typename Impl::MirrorDRViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::view_type> -create_mirror_view( - const Kokkos::DynRankView<T, P...>& src, - const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl -// Create a mirror view in host space -template <class T, class... P> -inline std::enable_if_t< - (std::is_same< - typename DynRankView<T, P...>::memory_space, - typename DynRankView<T, P...>::HostMirror::memory_space>::value && - std::is_same<typename DynRankView<T, P...>::data_type, - typename DynRankView<T, P...>::HostMirror::data_type>::value), - typename DynRankView<T, P...>::HostMirror> -create_mirror_view(const Kokkos::DynRankView<T, P...>& src) { - return src; -} - +// public interface template <class T, class... P> -inline std::enable_if_t< - !(std::is_same< - typename DynRankView<T, P...>::memory_space, - typename DynRankView<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView<T, P...>::data_type, - typename DynRankView<T, P...>::HostMirror::data_type>::value), - typename DynRankView<T, P...>::HostMirror> -create_mirror_view(const Kokkos::DynRankView<T, P...>& src) { - return Kokkos::create_mirror(src); +inline auto create_mirror_view(const Kokkos::DynRankView<T, P...>& src) { + return Impl::create_mirror_view(src, Kokkos::view_alloc()); } +// public interface that accepts a without initializing flag template <class T, class... 
P> inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, const DynRankView<T, P...>& src) { return Impl::create_mirror_view(src, Kokkos::view_alloc(wi)); } -// Create a mirror view in a new space -// FIXME_C++17 Improve SFINAE here. +// public interface that accepts a space template <class Space, class T, class... P, class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> -inline typename Impl::MirrorDRViewType<Space, T, P...>::view_type -create_mirror_view( - const Space&, const Kokkos::DynRankView<T, P...>& src, - std::enable_if_t< - Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>* = nullptr) { - return src; +inline auto create_mirror_view(const Space&, + const Kokkos::DynRankView<T, P...>& src) { + return Impl::create_mirror_view( + src, Kokkos::view_alloc(typename Space::memory_space())); } -// FIXME_C++17 Improve SFINAE here. +// public interface that accepts a space and a without initializing flag template <class Space, class T, class... P, - class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> -inline typename Impl::MirrorDRViewType<Space, T, P...>::view_type -create_mirror_view( - const Space& space, const Kokkos::DynRankView<T, P...>& src, - std::enable_if_t< - !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>* = nullptr) { - return Kokkos::create_mirror(space, src); -} - -template <class Space, class T, class... P> + typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::DynRankView<T, P...>& src) { @@ -2179,6 +1652,8 @@ inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... 
ViewCtorArgs> inline auto create_mirror_view( const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, @@ -2186,75 +1661,51 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -template <class... ViewCtorArgs, class T, class... P> +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class... ViewCtorArgs, class T, class... P, + class Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp<ViewCtorArgs...>&, - const Kokkos::DynRankView<T, P...>& src, - std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - Impl::MirrorDRViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, + const Kokkos::DynRankView<T, P...>& src) { using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; -} -template <class... ViewCtorArgs, class T, class... 
P> -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - const Kokkos::DynRankView<T, P...>& src, - std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - !Impl::MirrorDRViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::is_same_memspace>* = nullptr) { - using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorDRViewType<Space, T, P...>::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property<Impl::LabelTag>(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{ - arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + Impl::check_view_ctor_args_create_mirror_view_and_copy<ViewCtorArgs...>(); + + if constexpr (Impl::MirrorDRViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + 
"Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorDRViewType<Space, T, P...>::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property<Impl::LabelTag>(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{ + arg_prop_copy, Impl::reconstructLayout(src.layout(), src.rank())}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } +#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC) + __builtin_unreachable(); +#endif } template <class Space, class T, class... P> diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp index 12885edbae9238c0c412d6c59c9ad8f9a8dcd50d..caae3f791f0486eb119854c8aacccea47371c6f8 100644 --- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -40,10 +40,10 @@ struct ChunkedArrayManager { using pointer_type = ValueType*; using track_type = Kokkos::Impl::SharedAllocationTracker; - ChunkedArrayManager() = default; - ChunkedArrayManager(ChunkedArrayManager const&) = default; - ChunkedArrayManager(ChunkedArrayManager&&) = default; - ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; + ChunkedArrayManager() = default; + ChunkedArrayManager(ChunkedArrayManager const&) = default; + ChunkedArrayManager(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; template 
<typename Space, typename Value> @@ -129,10 +129,10 @@ struct ChunkedArrayManager { /// allocation template <typename Space> struct Destroy { - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; Destroy& operator=(const Destroy&) = default; Destroy(std::string label, value_type** arg_chunk, @@ -250,7 +250,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> { // It is assumed that the value_type is trivially copyable; // when this is not the case, potential problems can occur. - static_assert(std::is_void<typename traits::specialize>::value, + static_assert(std::is_void_v<typename traits::specialize>, "DynamicView only implemented for non-specialized View type"); private: @@ -363,7 +363,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> { enum { reference_type_is_lvalue_reference = - std::is_lvalue_reference<reference_type>::value + std::is_lvalue_reference_v<reference_type> }; KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { @@ -463,11 +463,11 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> { //---------------------------------------------------------------------- - ~DynamicView() = default; - DynamicView() = default; - DynamicView(DynamicView&&) = default; - DynamicView(const DynamicView&) = default; - DynamicView& operator=(DynamicView&&) = default; + ~DynamicView() = default; + DynamicView() = default; + DynamicView(DynamicView&&) = default; + DynamicView(const DynamicView&) = default; + DynamicView& operator=(DynamicView&&) = default; DynamicView& operator=(const DynamicView&) = default; template <class RT, class... 
RP> @@ -572,7 +572,7 @@ struct MirrorDynamicViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same<memory_space, typename src_view_type::memory_space>::value + std::is_same_v<memory_space, typename src_view_type::memory_space> }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -590,104 +590,93 @@ struct MirrorDynamicViewType { } // namespace Impl namespace Impl { + +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... ViewCtorArgs> -inline auto create_mirror( - const Kokkos::Experimental::DynamicView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - std::enable_if_t<!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* = - nullptr) { +inline auto create_mirror(const Kokkos::Experimental::DynamicView<T, P...>& src, + const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); + check_view_ctor_args_create_mirror<ViewCtorArgs...>(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - auto ret = typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror( - prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); + if constexpr (Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space) { + using MemorySpace = typename alloc_prop_input::memory_space; - ret.resize_serial(src.extent(0)); + auto ret 
= typename Kokkos::Impl::MirrorDynamicViewType< + MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), + src.chunk_max() * src.chunk_size()); - return ret; -} - -template <class T, class... P, class... ViewCtorArgs> -inline auto create_mirror( - const Kokkos::Experimental::DynamicView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - std::enable_if_t<Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>* = - nullptr) { - using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); - - using MemorySpace = typename alloc_prop_input::memory_space; - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); + ret.resize_serial(src.extent(0)); - auto ret = typename Kokkos::Impl::MirrorDynamicViewType< - MemorySpace, T, P...>::view_type(prop_copy, src.chunk_size(), - src.chunk_max() * src.chunk_size()); + return ret; + } else { + auto ret = typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror( + prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - ret.resize_serial(src.extent(0)); + ret.resize_serial(src.extent(0)); - return ret; + return ret; + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } + } // namespace Impl -// Create a mirror in host space -template <class T, class... P> +// public interface +template <class T, class... 
P, + typename Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> inline auto create_mirror( const Kokkos::Experimental::DynamicView<T, P...>& src) { return Impl::create_mirror(src, Impl::ViewCtorProp<>{}); } -template <class T, class... P> +// public interface that accepts a without initializing flag +template <class T, class... P, + typename Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> inline auto create_mirror( Kokkos::Impl::WithoutInitializing_t wi, const Kokkos::Experimental::DynamicView<T, P...>& src) { return Impl::create_mirror(src, Kokkos::view_alloc(wi)); } -// Create a mirror in a new space -template <class Space, class T, class... P> +// public interface that accepts a space +template <class Space, class T, class... P, + typename Enable = std::enable_if_t< + Kokkos::is_space<Space>::value && + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> inline auto create_mirror( const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } -template <class Space, class T, class... P> -typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::DynamicView<T, P...>& src) { +// public interface that accepts a space and a without initializing flag +template <class Space, class T, class... P, + typename Enable = std::enable_if_t< + Kokkos::is_space<Space>::value && + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::DynamicView<T, P...>& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } -template <class T, class... P, class... 
ViewCtorArgs> +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class T, class... P, class... ViewCtorArgs, + typename Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> inline auto create_mirror( const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, const Kokkos::Experimental::DynamicView<T, P...>& src) { @@ -696,76 +685,56 @@ inline auto create_mirror( namespace Impl { +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && - (std::is_same< - typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView<T, P...>::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>&) { - return src; -} - -template <class T, class... P, class... 
ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView<T, P...>::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror> -create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template <class T, class... P, class... ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -std::enable_if_t<Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>&) { - return src; +inline auto create_mirror_view( + const Kokkos::Experimental::DynamicView<T, P...>& src, + [[maybe_unused]] const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { + if constexpr (!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space) { + if constexpr (std::is_same_v<typename Kokkos::Experimental::DynamicView< + T, P...>::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space> && + std::is_same_v<typename Kokkos::Experimental::DynamicView< + T, P...>::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>) { + return + typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } 
else { + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp< + ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } -template <class T, class... P, class... ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -std::enable_if_t<!Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl -// Create a mirror view in host space +// public interface template <class T, class... P> inline auto create_mirror_view( const typename Kokkos::Experimental::DynamicView<T, P...>& src) { return Impl::create_mirror_view(src, Impl::ViewCtorProp<>{}); } +// public interface that accepts a without initializing flag template <class T, class... P> inline auto create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, @@ -773,15 +742,18 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, Kokkos::view_alloc(wi)); } -// Create a mirror in a new space -template <class Space, class T, class... P> +// public interface that accepts a space +template <class Space, class T, class... 
P, + class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> inline auto create_mirror_view( const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) { return Impl::create_mirror_view(src, view_alloc(typename Space::memory_space{})); } -template <class Space, class T, class... P> +// public interface that accepts a space and a without initializing flag +template <class Space, class T, class... P, + class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> inline auto create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) { @@ -789,6 +761,8 @@ inline auto create_mirror_view( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... ViewCtorArgs> inline auto create_mirror_view( const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, @@ -861,21 +835,17 @@ inline void deep_copy(const View<T, DP...>& dst, using dst_execution_space = typename ViewTraits<T, DP...>::execution_space; using src_memory_space = typename ViewTraits<T, SP...>::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility<dst_execution_space, - src_memory_space>::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility<dst_execution_space, + src_memory_space>::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. 
- Kokkos::Impl::ViewRemap<dst_type, src_type>(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::Impl::ViewRemap<dst_type, src_type>(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } template <class T, class... DP, class... SP> @@ -887,21 +857,17 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst, using dst_execution_space = typename ViewTraits<T, DP...>::execution_space; using src_memory_space = typename ViewTraits<T, SP...>::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility<dst_execution_space, - src_memory_space>::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility<dst_execution_space, + src_memory_space>::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap<dst_type, src_type>(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::Impl::ViewRemap<dst_type, src_type>(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } namespace Impl { @@ -985,80 +951,57 @@ struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>, } // namespace Impl -template <class... ViewCtorArgs, class T, class... 
P> +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class... ViewCtorArgs, class T, class... P, + class Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp<ViewCtorArgs...>&, - const Kokkos::Experimental::DynamicView<T, P...>& src, - std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, + const Kokkos::Experimental::DynamicView<T, P...>& src) { using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; -} -template <class... ViewCtorArgs, class T, class... 
P> -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - const Kokkos::Experimental::DynamicView<T, P...>& src, - std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - !Impl::MirrorDynamicViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::is_same_memspace>* = nullptr) { - using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); - using Space = typename alloc_prop_input::memory_space; - using Mirror = - typename Impl::MirrorDynamicViewType<Space, T, P...>::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property<Impl::LabelTag>(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type( - arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); - mirror.resize_serial(src.extent(0)); - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + Impl::check_view_ctor_args_create_mirror_view_and_copy<ViewCtorArgs...>(); + + if constexpr (Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr 
(!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = + typename Impl::MirrorDynamicViewType<Space, T, P...>::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property<Impl::LabelTag>(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type( + arg_prop_copy, src.chunk_size(), src.chunk_max() * src.chunk_size()); + mirror.resize_serial(src.extent(0)); + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } +#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC) + __builtin_unreachable(); +#endif } -template <class Space, class T, class... P> +template <class Space, class T, class... 
P, + typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> auto create_mirror_view_and_copy( const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src, std::string const& name = "") { diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp index 92aead28784fd0acb1499767e124f743b894b1a4..cf23c25b86bdcd6e93496706122c53d273b8ed67 100644 --- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -50,9 +50,9 @@ inline constexpr bool is_offset_view_v = is_offset_view<T>::value; #define KOKKOS_INVALID_INDEX_RANGE \ { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET } -template <typename iType, std::enable_if_t<std::is_integral<iType>::value && - std::is_signed<iType>::value, - iType> = 0> +template <typename iType, + std::enable_if_t<std::is_integral_v<iType> && std::is_signed_v<iType>, + iType> = 0> using IndexRange = Kokkos::Array<iType, 2>; using index_list_type = std::initializer_list<int64_t>; @@ -118,21 +118,14 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( (enum {LEN = 1024}; char buffer[LEN]; const std::string label = tracker.template get_label<MemorySpace>(); int n = snprintf(buffer, LEN, - "OffsetView bounds error of view labeled %s (", - label.c_str()); + "OffsetView bounds error of view labeled %s (", + label.c_str()); offsetview_error_operator_bounds<0>(buffer + n, LEN - n, map, begins, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if it is not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. 
*/ - if (tracker.has_record()) { - Kokkos::Impl::operator_bounds_error_on_device(map); - } else { Kokkos::abort("OffsetView bounds error"); })) + Kokkos::abort(buffer);)) + + KOKKOS_IF_ON_DEVICE( + (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) } } @@ -187,44 +180,40 @@ void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank, } // namespace Impl template <class DataType, class... Properties> -class OffsetView : public ViewTraits<DataType, Properties...> { - public: - using traits = ViewTraits<DataType, Properties...>; - +class OffsetView : public View<DataType, Properties...> { private: template <class, class...> friend class OffsetView; - template <class, class...> - friend class View; // FIXME delete this line - template <class, class...> - friend class Kokkos::Impl::ViewMapping; - using map_type = Kokkos::Impl::ViewMapping<traits, void>; - using track_type = Kokkos::Impl::SharedAllocationTracker; + using base_t = View<DataType, Properties...>; public: - enum { Rank = map_type::Rank }; - using begins_type = Kokkos::Array<int64_t, Rank>; + // typedefs to reduce typing base_t:: further down + using traits = typename base_t::traits; + // FIXME: should be base_t::index_type after refactor + using index_type = typename base_t::memory_space::size_type; + using pointer_type = typename base_t::pointer_type; + + using begins_type = Kokkos::Array<int64_t, base_t::rank()>; template <typename iType, - std::enable_if_t<std::is_integral<iType>::value, iType> = 0> + std::enable_if_t<std::is_integral_v<iType>, iType> = 0> KOKKOS_FUNCTION int64_t begin(const iType local_dimension) const { - return local_dimension < Rank ? m_begins[local_dimension] - : KOKKOS_INVALID_OFFSET; + return static_cast<size_t>(local_dimension) < base_t::rank() + ? 
m_begins[local_dimension] + : KOKKOS_INVALID_OFFSET; } KOKKOS_FUNCTION begins_type begins() const { return m_begins; } template <typename iType, - std::enable_if_t<std::is_integral<iType>::value, iType> = 0> + std::enable_if_t<std::is_integral_v<iType>, iType> = 0> KOKKOS_FUNCTION int64_t end(const iType local_dimension) const { - return begin(local_dimension) + m_map.extent(local_dimension); + return begin(local_dimension) + base_t::extent(local_dimension); } private: - track_type m_track; - map_type m_map; begins_type m_begins; public: @@ -252,560 +241,60 @@ class OffsetView : public ViewTraits<DataType, Properties...> { typename traits::array_layout, typename traits::host_mirror_space>; - //---------------------------------------- - // Domain rank and extents - - /** \brief rank() to be implemented - */ - // KOKKOS_FUNCTION - // static - // constexpr unsigned rank() { return map_type::Rank; } - - template <typename iType> - KOKKOS_FUNCTION constexpr std::enable_if_t<std::is_integral<iType>::value, - size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template <typename iType> - KOKKOS_FUNCTION constexpr std::enable_if_t<std::is_integral<iType>::value, - int> - extent_int(const iType& r) const { - return static_cast<int>(m_map.extent(r)); - } - - KOKKOS_FUNCTION constexpr typename traits::array_layout layout() const { - return m_map.layout(); - } - - KOKKOS_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } - KOKKOS_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } - KOKKOS_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } - KOKKOS_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } - KOKKOS_FUNCTION constexpr size_t 
stride_4() const { return m_map.stride_4(); } - KOKKOS_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } - KOKKOS_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } - KOKKOS_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } - - template <typename iType> - KOKKOS_FUNCTION constexpr std::enable_if_t<std::is_integral<iType>::value, - size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template <typename iType> - KOKKOS_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. - - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference<reference_type>::value - }; - - KOKKOS_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_FUNCTION constexpr pointer_type data() const { return m_map.data(); } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_FUNCTION - const Kokkos::Impl::ViewMapping<traits, void>& implementation_map() const { - return m_map; + template <size_t... I, class... OtherIndexTypes> + KOKKOS_FUNCTION typename base_t::reference_type offset_operator( + std::integer_sequence<size_t, I...>, OtherIndexTypes... 
indices) const { + return base_t::operator()((indices - m_begins[I])...); } - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same<typename traits::array_layout, Kokkos::LayoutLeft>::value; - - static constexpr bool is_layout_right = - std::is_same<typename traits::array_layout, Kokkos::LayoutRight>::value; - - static constexpr bool is_layout_stride = - std::is_same<typename traits::array_layout, Kokkos::LayoutStride>::value; - - static constexpr bool is_default_map = - std::is_void<typename traits::specialize>::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ - ARG; - -#else - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); - + template <class OtherIndexType> +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_convertible_v<OtherIndexType, index_type> && + std::is_nothrow_constructible_v<index_type, OtherIndexType> && + (base_t::rank() == 1)) #endif - public: - //------------------------------ - // Rank 0 operator() - - KOKKOS_FORCEINLINE_FUNCTION - reference_type operator()() const { return m_map.reference(); } - //------------------------------ - // Rank 1 operator() - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0>::value && (1 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, 
m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0>::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0>::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - //------------------------------ - // Rank 1 operator[] - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0>::value && (1 == Rank) && !is_default_map), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0>::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0>::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return 
m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - - //------------------------------ - // Rank 2 - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1>::value && - (2 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.reference(j0, j1); - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1>::value && (2 == Rank) && - is_default_map && is_layout_left && (traits::rank_dynamic == 0)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1>::value && (2 == Rank) && - is_default_map && is_layout_left && (traits::rank_dynamic != 0)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1>::value && (2 == Rank) && - is_default_map && is_layout_right && (traits::rank_dynamic == 0)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - 
m_begins[1]; - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1>::value && (2 == Rank) && - is_default_map && is_layout_right && (traits::rank_dynamic != 0)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1>::value && - (2 == Rank) && is_default_map && is_layout_stride), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + - j1 * m_map.m_impl_offset.m_stride.S1]; - } - - //------------------------------ - // Rank 3 - - template <typename I0, typename I1, typename I2> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2>::value && - (3 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2)]; - } - - template <typename I0, typename I1, typename I2> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2>::value && - (3 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - 
KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.reference(j0, j1, j2); - } - - //------------------------------ - // Rank 4 - - template <typename I0, typename I1, typename I2, typename I3> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && - (4 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3)]; - } - - template <typename I0, typename I1, typename I2, typename I3> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && - (4 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.reference(j0, j1, j2, j3); - } - - //------------------------------ - // Rank 5 - - template <typename I0, typename I1, typename I2, typename I3, typename I4> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && - (5 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const 
size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4)]; - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && - (5 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.reference(j0, j1, j2, j3, j4); - } - - //------------------------------ - // Rank 6 - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value && - (6 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5)]; - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value && - (6 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, 
const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.reference(j0, j1, j2, j3, j4, j5); - } - - //------------------------------ - // Rank 7 - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value && - (7 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6)]; - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value && - (7 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const 
size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6); + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator[]( + const OtherIndexType& idx) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert(std::is_convertible_v<OtherIndexType, index_type> && + std::is_nothrow_constructible_v<index_type, OtherIndexType> && + (base_t::rank() == 1)); +#endif + return base_t::operator[](idx - m_begins[0]); } - //------------------------------ - // Rank 8 - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6, typename I7> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value && - (8 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map - .m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6, j7)]; + template <class... OtherIndexTypes> +#ifndef KOKKOS_ENABLE_CXX17 + requires((std::is_convertible_v<OtherIndexTypes, index_type> && ...) && + (std::is_nothrow_constructible_v<index_type, OtherIndexTypes> && + ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())) +#endif + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator()( + OtherIndexTypes... indices) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert( + (std::is_convertible_v<OtherIndexTypes, index_type> && ...) 
&& + (std::is_nothrow_constructible_v<index_type, OtherIndexTypes> && ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())); +#endif + return offset_operator(std::make_index_sequence<base_t::rank()>(), + indices...); } - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6, typename I7> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value && - (8 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6, j7); - } + template <class... OtherIndexTypes> + KOKKOS_FUNCTION constexpr typename base_t::reference_type access( + OtherIndexTypes... 
args) const = delete; -#undef KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY + //---------------------------------------- //---------------------------------------- // Standard destructor, constructors, and assignment operators - KOKKOS_DEFAULTED_FUNCTION - ~OffsetView() = default; - KOKKOS_FUNCTION - OffsetView() : m_track(), m_map() { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = KOKKOS_INVALID_OFFSET; - } - - KOKKOS_FUNCTION - OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(rhs.m_map), - m_begins(rhs.m_begins) {} - - KOKKOS_FUNCTION - OffsetView(OffsetView&& rhs) - : m_track(std::move(rhs.m_track)), - m_map(std::move(rhs.m_map)), - m_begins(std::move(rhs.m_begins)) {} - - KOKKOS_FUNCTION - OffsetView& operator=(const OffsetView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_begins = rhs.m_begins; - return *this; - } - - KOKKOS_FUNCTION - OffsetView& operator=(OffsetView&& rhs) { - m_track = std::move(rhs.m_track); - m_map = std::move(rhs.m_map); - m_begins = std::move(rhs.m_begins); - return *this; + OffsetView() : base_t() { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = KOKKOS_INVALID_OFFSET; } // interoperability with View @@ -816,20 +305,10 @@ class OffsetView : public ViewTraits<DataType, Properties...> { public: KOKKOS_FUNCTION - view_type view() const { - view_type v(m_track, m_map); - return v; - } + view_type view() const { return *this; } template <class RT, class... 
RP> - KOKKOS_FUNCTION OffsetView(const View<RT, RP...>& aview) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView<RT, RP...>::traits; - using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - + KOKKOS_FUNCTION OffsetView(const View<RT, RP...>& aview) : base_t(aview) { for (size_t i = 0; i < View<RT, RP...>::rank(); ++i) { m_begins[i] = 0; } @@ -838,19 +317,14 @@ class OffsetView : public ViewTraits<DataType, Properties...> { template <class RT, class... RP> KOKKOS_FUNCTION OffsetView(const View<RT, RP...>& aview, const index_list_type& minIndices) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView<RT, RP...>::traits; - using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(aview) { + KOKKOS_IF_ON_HOST( + (Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, aview.label());)) + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -858,27 +332,13 @@ class OffsetView : public ViewTraits<DataType, Properties...> { template <class RT, class... 
RP> KOKKOS_FUNCTION OffsetView(const View<RT, RP...>& aview, const begins_type& beg) - : m_track(aview.impl_track()), m_map(), m_begins(beg) { - using SrcTraits = typename OffsetView<RT, RP...>::traits; - using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - } + : base_t(aview), m_begins(beg) {} // may assign unmanaged from managed. template <class RT, class... RP> KOKKOS_FUNCTION OffsetView(const OffsetView<RT, RP...>& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(), - m_begins(rhs.m_begins) { - using SrcTraits = typename OffsetView<RT, RP...>::traits; - using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); // swb what about assign? - } + : base_t(rhs.view()), m_begins(rhs.m_begins) {} private: enum class subtraction_failure { @@ -917,7 +377,7 @@ class OffsetView : public ViewTraits<DataType, Properties...> { static subtraction_failure runtime_check_begins_ends_host(const B& begins, const E& ends) { std::string message; - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) message += "begins.size() " "(" + @@ -925,19 +385,19 @@ class OffsetView : public ViewTraits<DataType, Properties...> { ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) message += "ends.size() " "(" + - std::to_string(begins.size()) + + std::to_string(ends.size()) + ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; @@ -979,7 +439,7 @@ class OffsetView : public ViewTraits<DataType, Properties...> { message = "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView\n" + message; - 
Kokkos::Impl::throw_runtime_exception(message); + Kokkos::abort(message.c_str()); } return subtraction_failure::none; @@ -989,11 +449,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> { template <typename B, typename E> KOKKOS_FUNCTION static subtraction_failure runtime_check_begins_ends_device( const B& begins, const E& ends) { - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: begins has bad Rank"); - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: ends has bad Rank"); @@ -1031,20 +491,25 @@ class OffsetView : public ViewTraits<DataType, Properties...> { // Precondition: begins.size() == ends.size() == m_begins.size() == Rank template <typename B, typename E> KOKKOS_FUNCTION OffsetView(const pointer_type& p, const B& begins_, - const E& ends_, - subtraction_failure) - : m_track() // no tracking - , - m_map(Kokkos::Impl::ViewCtorProp<pointer_type>(p), - typename traits::array_layout( - Rank > 0 ? at(ends_, 0) - at(begins_, 0) : 0, - Rank > 1 ? at(ends_, 1) - at(begins_, 1) : 0, - Rank > 2 ? at(ends_, 2) - at(begins_, 2) : 0, - Rank > 3 ? at(ends_, 3) - at(begins_, 3) : 0, - Rank > 4 ? at(ends_, 4) - at(begins_, 4) : 0, - Rank > 5 ? at(ends_, 5) - at(begins_, 5) : 0, - Rank > 6 ? at(ends_, 6) - at(begins_, 6) : 0, - Rank > 7 ? at(ends_, 7) - at(begins_, 7) : 0)) { + const E& ends_, subtraction_failure) + : base_t(Kokkos::view_wrap(p), + typename traits::array_layout( + base_t::rank() > 0 ? at(ends_, 0) - at(begins_, 0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 1 ? at(ends_, 1) - at(begins_, 1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 2 ? at(ends_, 2) - at(begins_, 2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 3 ? at(ends_, 3) - at(begins_, 3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 4 ? 
at(ends_, 4) - at(begins_, 4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 5 ? at(ends_, 5) - at(begins_, 5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 6 ? at(ends_, 6) - at(begins_, 6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 7 ? at(ends_, 7) - at(begins_, 7) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG)) { for (size_t i = 0; i != m_begins.size(); ++i) { m_begins[i] = at(begins_, i); }; @@ -1078,15 +543,6 @@ class OffsetView : public ViewTraits<DataType, Properties...> { : OffsetView(p, begins_, ends_, runtime_check_begins_ends(begins_, ends_)) {} - //---------------------------------------- - // Allocation tracking properties - KOKKOS_FUNCTION - int use_count() const { return m_track.use_count(); } - - const std::string label() const { - return m_track.template get_label<typename traits::memory_space>(); - } - // Choosing std::pair as type for the arguments allows constructing an // OffsetView using list initialization syntax, e.g., // OffsetView dummy("dummy", {-1, 3}, {-2,2}); @@ -1108,18 +564,34 @@ class OffsetView : public ViewTraits<DataType, Properties...> { const std::pair<int64_t, int64_t> range7 = KOKKOS_INVALID_INDEX_RANGE ) - : OffsetView( - Kokkos::Impl::ViewCtorProp<std::string>(arg_label), - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(Kokkos::Impl::ViewCtorProp<std::string>(arg_label), + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG - 1 + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template <class... P> explicit OffsetView( @@ -1132,18 +604,34 @@ class OffsetView : public ViewTraits<DataType, Properties...> { const std::pair<int64_t, int64_t> range5 = KOKKOS_INVALID_INDEX_RANGE, const std::pair<int64_t, int64_t> range6 = KOKKOS_INVALID_INDEX_RANGE, const std::pair<int64_t, int64_t> range7 = KOKKOS_INVALID_INDEX_RANGE) - : OffsetView( - arg_prop, - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(arg_prop, + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template <class... P> explicit KOKKOS_FUNCTION OffsetView( @@ -1151,9 +639,14 @@ class OffsetView : public ViewTraits<DataType, Properties...> { std::enable_if_t<Kokkos::Impl::ViewCtorProp<P...>::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { + : base_t(arg_prop, arg_layout) { + KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, + base_t::label());)) + + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -1170,42 +663,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> { std::enable_if_t<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track(), - m_map() - - { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i]; - - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Kokkos::Impl::with_properties_if_unset( - 
arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "OffsetView allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. - Kokkos::Impl::throw_runtime_exception( - "Constructing OffsetView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, - Kokkos::Impl::ViewCtorProp<P...>::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(arg_prop, arg_layout) { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = minIndices.begin()[i]; } }; @@ -1215,7 +675,7 @@ class OffsetView : public ViewTraits<DataType, Properties...> { */ template <typename D, class... 
P> KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView<D, P...>& V) { - return V.Rank; + return V.rank(); } // Temporary until added to view //---------------------------------------------------------------------------- @@ -1223,8 +683,8 @@ KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView<D, P...>& V) { namespace Impl { template <class T> -KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, T> -shift_input(const T arg, const int64_t offset) { +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral_v<T>, T> shift_input( + const T arg, const int64_t offset) { return arg - offset; } @@ -1235,13 +695,13 @@ Kokkos::ALL_t shift_input(const Kokkos::ALL_t arg, const int64_t /*offset*/) { template <class T> KOKKOS_INLINE_FUNCTION - std::enable_if_t<std::is_integral<T>::value, Kokkos::pair<T, T>> + std::enable_if_t<std::is_integral_v<T>, Kokkos::pair<T, T>> shift_input(const Kokkos::pair<T, T> arg, const int64_t offset) { return Kokkos::make_pair<T, T>(arg.first - offset, arg.second - offset); } template <class T> -inline std::enable_if_t<std::is_integral<T>::value, std::pair<T, T>> -shift_input(const std::pair<T, T> arg, const int64_t offset) { +inline std::enable_if_t<std::is_integral_v<T>, std::pair<T, T>> shift_input( + const std::pair<T, T> arg, const int64_t offset) { return std::make_pair<T, T>(arg.first - offset, arg.second - offset); } @@ -1250,7 +710,7 @@ KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin( const size_t i, Kokkos::Array<int64_t, N>& subviewBegins, std::enable_if_t<N != 0, const Arg> shiftedArg, const Arg arg, const A viewBegins, size_t& counter) { - if (!std::is_integral<Arg>::value) { + if (!std::is_integral_v<Arg>) { subviewBegins[counter] = shiftedArg == arg ? viewBegins[i] : 0; counter++; } @@ -1659,7 +1119,7 @@ KOKKOS_INLINE_FUNCTION ViewTraits<D, P...>, Args...>::type>::type subview(const OffsetView<D, P...>& src, Args... 
args) { static_assert( - OffsetView<D, P...>::Rank == sizeof...(Args), + OffsetView<D, P...>::rank() == sizeof...(Args), "subview requires one argument for each source OffsetView rank"); return Kokkos::Experimental::Impl::subview_offset(src, args...); @@ -1679,12 +1139,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView<LT, LP...>& lhs, using lhs_traits = ViewTraits<LT, LP...>; using rhs_traits = ViewTraits<RT, RP...>; - return std::is_same<typename lhs_traits::const_value_type, - typename rhs_traits::const_value_type>::value && - std::is_same<typename lhs_traits::array_layout, - typename rhs_traits::array_layout>::value && - std::is_same<typename lhs_traits::memory_space, - typename rhs_traits::memory_space>::value && + return std::is_same_v<typename lhs_traits::const_value_type, + typename rhs_traits::const_value_type> && + std::is_same_v<typename lhs_traits::array_layout, + typename rhs_traits::array_layout> && + std::is_same_v<typename lhs_traits::memory_space, + typename rhs_traits::memory_space> && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1710,12 +1170,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View<LT, LP...>& lhs, using lhs_traits = ViewTraits<LT, LP...>; using rhs_traits = ViewTraits<RT, RP...>; - return std::is_same<typename lhs_traits::const_value_type, - typename rhs_traits::const_value_type>::value && - std::is_same<typename lhs_traits::array_layout, - typename rhs_traits::array_layout>::value && - std::is_same<typename lhs_traits::memory_space, - typename rhs_traits::memory_space>::value && + return std::is_same_v<typename lhs_traits::const_value_type, + typename rhs_traits::const_value_type> && + std::is_same_v<typename lhs_traits::array_layout, + typename rhs_traits::array_layout> && + std::is_same_v<typename lhs_traits::memory_space, + typename rhs_traits::memory_space> && 
unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1742,11 +1202,11 @@ template <class DT, class... DP> inline void deep_copy( const Experimental::OffsetView<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { static_assert( - std::is_same<typename ViewTraits<DT, DP...>::non_const_value_type, - typename ViewTraits<DT, DP...>::value_type>::value, + std::is_same_v<typename ViewTraits<DT, DP...>::non_const_value_type, + typename ViewTraits<DT, DP...>::value_type>, "deep_copy requires non-const type"); auto dstView = dst.view(); @@ -1757,11 +1217,11 @@ template <class DT, class... DP, class ST, class... SP> inline void deep_copy( const Experimental::OffsetView<DT, DP...>& dst, const Experimental::OffsetView<ST, SP...>& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { static_assert( - std::is_same<typename ViewTraits<DT, DP...>::value_type, - typename ViewTraits<ST, SP...>::non_const_value_type>::value, + std::is_same_v<typename ViewTraits<DT, DP...>::value_type, + typename ViewTraits<ST, SP...>::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1771,11 +1231,11 @@ template <class DT, class... DP, class ST, class... 
SP> inline void deep_copy( const Experimental::OffsetView<DT, DP...>& dst, const View<ST, SP...>& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { static_assert( - std::is_same<typename ViewTraits<DT, DP...>::value_type, - typename ViewTraits<ST, SP...>::non_const_value_type>::value, + std::is_same_v<typename ViewTraits<DT, DP...>::value_type, + typename ViewTraits<ST, SP...>::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1786,11 +1246,11 @@ template <class DT, class... DP, class ST, class... SP> inline void deep_copy( const View<DT, DP...>& dst, const Experimental::OffsetView<ST, SP...>& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { static_assert( - std::is_same<typename ViewTraits<DT, DP...>::value_type, - typename ViewTraits<ST, SP...>::non_const_value_type>::value, + std::is_same_v<typename ViewTraits<DT, DP...>::value_type, + typename ViewTraits<ST, SP...>::non_const_value_type>, "deep_copy requires matching non-const destination type"); Kokkos::deep_copy(dst, value.view()); @@ -1808,7 +1268,7 @@ struct MirrorOffsetViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same<memory_space, typename src_view_type::memory_space>::value + std::is_same_v<memory_space, typename src_view_type::memory_space> }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1824,103 +1284,90 @@ struct MirrorOffsetViewType { std::conditional_t<is_same_memspace, src_view_type, dest_view_type>; }; -template <class Space, class T, class... 
P> -struct MirrorOffsetType { - // The incoming view_type - using src_view_type = typename Kokkos::Experimental::OffsetView<T, P...>; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same<memory_space, typename src_view_type::memory_space>::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it.) - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = - Kokkos::Experimental::OffsetView<data_type, array_layout, Space>; -}; - } // namespace Impl namespace Impl { -template <class T, class... P, class... ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space, - typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror> -create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror( - Kokkos::create_mirror(arg_prop, src.view()), src.begins()); -} -template <class T, class... P, class... ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class T, class... P, class... 
ViewCtorArgs> inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space; - - static_assert( - !alloc_prop_input::has_label, - "The view constructor arguments passed to Kokkos::create_mirror " - "must not include a label!"); - static_assert( - !alloc_prop_input::has_pointer, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a pointer!"); - static_assert( - !alloc_prop_input::allow_padding, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not explicitly allow padding!"); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - - return typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + check_view_ctor_args_create_mirror<ViewCtorArgs...>(); + + if constexpr (Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space) { + using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space; + + auto prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string(src.label()).append("_mirror")); + + return typename Kokkos::Impl::MirrorOffsetViewType< + Space, T, P...>::dest_view_type(prop_copy, src.layout(), + {src.begin(0), src.begin(1), + src.begin(2), src.begin(3), + src.begin(4), src.begin(5), + src.begin(6), src.begin(7)}); + } else { + return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror( + Kokkos::create_mirror(arg_prop, src.view()), src.begins()); + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } + } // namespace Impl 
-// Create a mirror in host space -template <class T, class... P> +// public interface +template <class T, class... P, + typename = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> inline auto create_mirror( const Kokkos::Experimental::OffsetView<T, P...>& src) { return Impl::create_mirror(src, Impl::ViewCtorProp<>{}); } -template <class T, class... P> +// public interface that accepts a without initializing flag +template <class T, class... P, + typename = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> inline auto create_mirror( Kokkos::Impl::WithoutInitializing_t wi, const Kokkos::Experimental::OffsetView<T, P...>& src) { return Impl::create_mirror(src, Kokkos::view_alloc(wi)); } -// Create a mirror in a new space +// public interface that accepts a space template <class Space, class T, class... P, - typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> + typename Enable = std::enable_if_t< + Kokkos::is_space<Space>::value && + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> inline auto create_mirror( const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } -template <class Space, class T, class... P> -typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::OffsetView<T, P...>& src) { +// public interface that accepts a space and a without initializing flag +template <class Space, class T, class... 
P, + typename Enable = std::enable_if_t< + Kokkos::is_space<Space>::value && + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::OffsetView<T, P...>& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } -template <class T, class... P, class... ViewCtorArgs> +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class T, class... P, class... ViewCtorArgs, + typename = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> inline auto create_mirror( const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, const Kokkos::Experimental::OffsetView<T, P...>& src) { @@ -1928,76 +1375,56 @@ inline auto create_mirror( } namespace Impl { -template <class T, class... P, class... ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && - (std::is_same< - typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView<T, P...>::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>&) { - return src; -} +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... 
ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && - !(std::is_same< - typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView<T, P...>::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), - typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror> -create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return Kokkos::create_mirror(arg_prop, src); -} - -template <class T, class... P, class... ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -std::enable_if_t<Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>&) { - return src; +inline auto create_mirror_view( + const Kokkos::Experimental::OffsetView<T, P...>& src, + [[maybe_unused]] const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { + if constexpr (!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space) { + if constexpr (std::is_same_v<typename Kokkos::Experimental::OffsetView< + T, P...>::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space> && + std::is_same_v<typename Kokkos::Experimental::OffsetView< + T, P...>::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>) { + return + typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } else { + if 
constexpr (Impl::MirrorOffsetViewType<typename Impl::ViewCtorProp< + ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + return typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } -template <class T, class... P, class... ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -std::enable_if_t<!Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorOffsetViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} } // namespace Impl -// Create a mirror view in host space +// public interface template <class T, class... P> inline auto create_mirror_view( const typename Kokkos::Experimental::OffsetView<T, P...>& src) { return Impl::create_mirror_view(src, Impl::ViewCtorProp<>{}); } +// public interface that accepts a without initializing flag template <class T, class... P> inline auto create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, @@ -2005,7 +1432,7 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, Kokkos::view_alloc(wi)); } -// Create a mirror view in a new space +// public interface that accepts a space template <class Space, class T, class... 
P, typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> inline auto create_mirror_view( @@ -2014,7 +1441,9 @@ inline auto create_mirror_view( src, Kokkos::view_alloc(typename Space::memory_space{})); } -template <class Space, class T, class... P> +// public interface that accepts a space and a without initializing flag +template <class Space, class T, class... P, + typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> inline auto create_mirror_view( Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) { @@ -2022,6 +1451,8 @@ inline auto create_mirror_view( src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... ViewCtorArgs> inline auto create_mirror_view( const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, @@ -2029,7 +1460,9 @@ inline auto create_mirror_view( return Impl::create_mirror_view(src, arg_prop); } -// Create a mirror view and deep_copy in a new space +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class... ViewCtorArgs, class T, class... 
P> typename Kokkos::Impl::MirrorOffsetViewType< typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp index 9d04cf6acd0edc12b28c7c759512c8f2f3c0b526..52af567c61d20b600f8a7a87c8638a0beabe028f 100644 --- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -184,16 +184,16 @@ struct DefaultContribution<Kokkos::HIP, #ifdef KOKKOS_ENABLE_SYCL template <> -struct DefaultDuplication<Kokkos::Experimental::SYCL> { +struct DefaultDuplication<Kokkos::SYCL> { using type = Kokkos::Experimental::ScatterNonDuplicated; }; template <> -struct DefaultContribution<Kokkos::Experimental::SYCL, +struct DefaultContribution<Kokkos::SYCL, Kokkos::Experimental::ScatterNonDuplicated> { using type = Kokkos::Experimental::ScatterAtomic; }; template <> -struct DefaultContribution<Kokkos::Experimental::SYCL, +struct DefaultContribution<Kokkos::SYCL, Kokkos::Experimental::ScatterDuplicated> { using type = Kokkos::Experimental::ScatterAtomic; }; @@ -532,32 +532,56 @@ void args_to_array(size_t* array, int pos, T dim0, Dims... dims) { subview where the index specified is the largest-stride one. */ template <typename Layout, int rank, typename V, typename... Args> struct Slice { - using next = Slice<Layout, rank - 1, V, Kokkos::ALL_t, Args...>; - using value_type = typename next::value_type; - - static value_type get(V const& src, const size_t i, Args... args) { + using next = Slice<Layout, rank - 1, V, Kokkos::ALL_t, Args...>; + static auto get(V const& src, const size_t i, Args... args) { return next::get(src, i, Kokkos::ALL, args...); } }; template <typename V, typename... Args> struct Slice<Kokkos::LayoutRight, 1, V, Args...> { - using value_type = - typename Kokkos::Impl::ViewMapping<void, V, const size_t, Args...>::type; - static value_type get(V const& src, const size_t i, Args... 
args) { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, i, args...); } }; template <typename V, typename... Args> struct Slice<Kokkos::LayoutLeft, 1, V, Args...> { - using value_type = - typename Kokkos::Impl::ViewMapping<void, V, Args..., const size_t>::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +template <typename V, typename... Args> +struct Slice<Kokkos::layout_right, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template <typename V, typename... Args> +struct Slice<Kokkos::layout_left, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, args..., i); } }; +template <size_t Pad, typename V, typename... Args> +struct Slice<Kokkos::Experimental::layout_right_padded<Pad>, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template <size_t Pad, typename V, typename... Args> +struct Slice<Kokkos::Experimental::layout_left_padded<Pad>, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... 
args) { + return Kokkos::subview(src, args..., i); + } +}; +#endif + template <typename ExecSpace, typename ValueType, typename Op> struct ReduceDuplicates; @@ -905,7 +929,7 @@ class ScatterAccess<DataType, Op, DeviceType, Layout, ScatterNonDuplicated, template <typename Arg> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral<Arg>::value, + std::is_integral_v<Arg> && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(arg); @@ -1028,10 +1052,7 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op, *this); } - typename Kokkos::Impl::Experimental::Slice<Kokkos::LayoutRight, - internal_view_type::rank, - internal_view_type>::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1233,8 +1254,8 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op, arg_N[internal_view_type::rank - 1] = unique_token.size(); internal_view = internal_view_type( view_alloc(WithoutInitializing, - std::string("duplicated_") + original_view.label(), - exec_space), + std::string("duplicated_") + original_view.label(), + exec_space), arg_N[0], arg_N[1], arg_N[2], arg_N[3], arg_N[4], arg_N[5], arg_N[6], arg_N[7]); reset(exec_space); @@ -1310,10 +1331,7 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op, *this); } - typename Kokkos::Impl::Experimental::Slice<Kokkos::LayoutLeft, - internal_view_type::rank, - internal_view_type>::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1460,7 +1478,7 @@ class ScatterAccess<DataType, Op, DeviceType, Layout, ScatterDuplicated, template <typename Arg> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && 
std::is_integral<Arg>::value, + std::is_integral_v<Arg> && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(thread_id, arg); @@ -1470,9 +1488,9 @@ class ScatterAccess<DataType, Op, DeviceType, Layout, ScatterDuplicated, view_type const& view; // simplify RAII by disallowing copies - ScatterAccess(ScatterAccess const& other) = delete; + ScatterAccess(ScatterAccess const& other) = delete; ScatterAccess& operator=(ScatterAccess const& other) = delete; - ScatterAccess& operator=(ScatterAccess&& other) = delete; + ScatterAccess& operator=(ScatterAccess&& other) = delete; public: // do need to allow moves though, for the common @@ -1497,16 +1515,16 @@ ScatterView< RT, typename ViewTraits<RT, RP...>::array_layout, typename ViewTraits<RT, RP...>::device_type, Op, std::conditional_t< - std::is_void<Duplication>::value, + std::is_void_v<Duplication>, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits<RT, RP...>::execution_space>::type, Duplication>, std::conditional_t< - std::is_void<Contribution>::value, + std::is_void_v<Contribution>, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits<RT, RP...>::execution_space, typename std::conditional_t< - std::is_void<Duplication>::value, + std::is_void_v<Duplication>, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits<RT, RP...>::execution_space>::type, Duplication>>::type, diff --git a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 8ce868cac2172b417d2bde8e1f18bc390ed78fc1..ec1b8905c766c0b7e0311fd2aabff537dc659302 100644 --- a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -190,7 +190,7 @@ struct GraphRowViewConst { const typename GraphType::entries_type& colidx_in, const ordinal_type& stride, const ordinal_type& count, const OffsetType& idx, - const 
std::enable_if_t<std::is_integral<OffsetType>::value, int>& = 0) + const std::enable_if_t<std::is_integral_v<OffsetType>, int>& = 0) : colidx_(&colidx_in(idx)), stride_(stride), length(count) {} /// \brief Number of entries in the row. diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp index e001c062de3a38ec8e9c918276352b3475832909..4f47051a5c1c15c8a4408c835c8794c3241be6be 100644 --- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -34,7 +34,7 @@ #include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_UnorderedMap_impl.hpp> -#include <impl/Kokkos_ViewCtor.hpp> +#include <View/Kokkos_ViewCtor.hpp> #include <cstdint> @@ -243,16 +243,16 @@ class UnorderedMap { using const_map_type = UnorderedMap<const_key_type, const_value_type, device_type, hasher_type, equal_to_type>; - static const bool is_set = std::is_void<value_type>::value; - static const bool has_const_key = - std::is_same<const_key_type, declared_key_type>::value; - static const bool has_const_value = - is_set || std::is_same<const_value_type, declared_value_type>::value; + static constexpr bool is_set = std::is_void_v<value_type>; + static constexpr bool has_const_key = + std::is_same_v<const_key_type, declared_key_type>; + static constexpr bool has_const_value = + is_set || std::is_same_v<const_value_type, declared_value_type>; - static const bool is_insertable_map = + static constexpr bool is_insertable_map = !has_const_key && (is_set || !has_const_value); - static const bool is_modifiable_map = has_const_key && !has_const_value; - static const bool is_const_map = has_const_key && has_const_value; + static constexpr bool is_modifiable_map = has_const_key && !has_const_value; + static constexpr bool is_const_map = has_const_key && has_const_value; using insert_result = UnorderedMapInsertResult; @@ -337,27 +337,27 @@ class UnorderedMap { 
Impl::get_property<Impl::LabelTag>(prop_copy) + " - size")); m_available_indexes = - bitset_type(Kokkos::Impl::with_updated_label(prop_copy, " - bitset"), + bitset_type(Kokkos::Impl::append_to_label(prop_copy, " - bitset"), calculate_capacity(capacity_hint)); m_hash_lists = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - hash list"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - hash list"), Impl::find_hash_size(capacity())); m_next_index = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - next index"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - next index"), capacity() + 1); // +1 so that the *_at functions can always return a // valid reference - m_keys = key_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - keys"), capacity()); + m_keys = key_type_view(Kokkos::Impl::append_to_label(prop_copy, " - keys"), + capacity()); - m_values = value_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - values"), - is_set ? 0 : capacity()); + m_values = + value_type_view(Kokkos::Impl::append_to_label(prop_copy, " - values"), + is_set ? 0 : capacity()); m_scalars = - scalars_view(Kokkos::Impl::with_updated_label(prop_copy, " - scalars")); + scalars_view(Kokkos::Impl::append_to_label(prop_copy, " - scalars")); /** * Deep copies should also be done using the space instance if given. @@ -746,7 +746,7 @@ class UnorderedMap { /// 'const value_type' via Cuda texture fetch must return by value. template <typename Dummy = value_type> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_void<Dummy>::value, // !is_set + !std::is_void_v<Dummy>, // !is_set std::conditional_t<has_const_value, impl_value_type, impl_value_type &>> value_at(size_type i) const { KOKKOS_EXPECTS(i < capacity()); @@ -805,56 +805,94 @@ class UnorderedMap { return *this; } + // Re-allocate the views of the calling UnorderedMap according to src + // capacity, and deep copy the src data. 
template <typename SKey, typename SValue, typename SDevice> - std::enable_if_t<std::is_same<std::remove_const_t<SKey>, key_type>::value && - std::is_same<std::remove_const_t<SValue>, value_type>::value> + std::enable_if_t<std::is_same_v<std::remove_const_t<SKey>, key_type> && + std::is_same_v<std::remove_const_t<SValue>, value_type>> create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher, EqualTo> const &src) { if (m_hash_lists.data() != src.m_hash_lists.data()) { - insertable_map_type tmp; - - tmp.m_bounded_insert = src.m_bounded_insert; - tmp.m_hasher = src.m_hasher; - tmp.m_equal_to = src.m_equal_to; - tmp.m_size() = src.m_size(); - tmp.m_available_indexes = bitset_type(src.capacity()); - tmp.m_hash_lists = size_type_view( - view_alloc(WithoutInitializing, "UnorderedMap hash list"), - src.m_hash_lists.extent(0)); - tmp.m_next_index = size_type_view( - view_alloc(WithoutInitializing, "UnorderedMap next index"), - src.m_next_index.extent(0)); - tmp.m_keys = - key_type_view(view_alloc(WithoutInitializing, "UnorderedMap keys"), - src.m_keys.extent(0)); - tmp.m_values = value_type_view( - view_alloc(WithoutInitializing, "UnorderedMap values"), - src.m_values.extent(0)); - tmp.m_scalars = scalars_view("UnorderedMap scalars"); - - Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes); + allocate_view(src); + deep_copy_view(src); + } + } + + // Allocate views of the calling UnorderedMap with the same capacity as the + // src. 
+ template <typename SKey, typename SValue, typename SDevice> + std::enable_if_t<std::is_same_v<std::remove_const_t<SKey>, key_type> && + std::is_same_v<std::remove_const_t<SValue>, value_type>> + allocate_view( + UnorderedMap<SKey, SValue, SDevice, Hasher, EqualTo> const &src) { + insertable_map_type tmp; + + tmp.m_bounded_insert = src.m_bounded_insert; + tmp.m_hasher = src.m_hasher; + tmp.m_equal_to = src.m_equal_to; + tmp.m_size() = src.m_size(); + tmp.m_available_indexes = bitset_type(src.capacity()); + tmp.m_hash_lists = size_type_view( + view_alloc(WithoutInitializing, "UnorderedMap hash list"), + src.m_hash_lists.extent(0)); + tmp.m_next_index = size_type_view( + view_alloc(WithoutInitializing, "UnorderedMap next index"), + src.m_next_index.extent(0)); + tmp.m_keys = + key_type_view(view_alloc(WithoutInitializing, "UnorderedMap keys"), + src.m_keys.extent(0)); + tmp.m_values = + value_type_view(view_alloc(WithoutInitializing, "UnorderedMap values"), + src.m_values.extent(0)); + tmp.m_scalars = scalars_view("UnorderedMap scalars"); + + *this = tmp; + } + + // Deep copy view data from src. This requires that the src capacity is + // identical to the capacity of the calling UnorderedMap. + template <typename SKey, typename SValue, typename SDevice> + std::enable_if_t<std::is_same_v<std::remove_const_t<SKey>, key_type> && + std::is_same_v<std::remove_const_t<SValue>, value_type>> + deep_copy_view( + UnorderedMap<SKey, SValue, SDevice, Hasher, EqualTo> const &src) { +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // To deep copy UnorderedMap, capacity must be identical + KOKKOS_EXPECTS(capacity() == src.capacity()); +#else + if (capacity() != src.capacity()) { + allocate_view(src); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning( + "Warning: deep_copy_view() allocating views is deprecated. 
Must call " + "with UnorderedMaps of identical capacity, or use " + "create_copy_view().\n"); +#endif + } +#endif + + if (m_hash_lists.data() != src.m_hash_lists.data()) { + Kokkos::deep_copy(m_available_indexes, src.m_available_indexes); using raw_deep_copy = Kokkos::Impl::DeepCopy<typename device_type::memory_space, typename SDevice::memory_space>; - raw_deep_copy(tmp.m_hash_lists.data(), src.m_hash_lists.data(), + raw_deep_copy(m_hash_lists.data(), src.m_hash_lists.data(), sizeof(size_type) * src.m_hash_lists.extent(0)); - raw_deep_copy(tmp.m_next_index.data(), src.m_next_index.data(), + raw_deep_copy(m_next_index.data(), src.m_next_index.data(), sizeof(size_type) * src.m_next_index.extent(0)); - raw_deep_copy(tmp.m_keys.data(), src.m_keys.data(), + raw_deep_copy(m_keys.data(), src.m_keys.data(), sizeof(key_type) * src.m_keys.extent(0)); if (!is_set) { - raw_deep_copy(tmp.m_values.data(), src.m_values.data(), + raw_deep_copy(m_values.data(), src.m_values.data(), sizeof(impl_value_type) * src.m_values.extent(0)); } - raw_deep_copy(tmp.m_scalars.data(), src.m_scalars.data(), + raw_deep_copy(m_scalars.data(), src.m_scalars.data(), sizeof(int) * num_scalars); Kokkos::fence( - "Kokkos::UnorderedMap::create_copy_view: fence after copy to tmp"); - - *this = tmp; + "Kokkos::UnorderedMap::deep_copy_view: fence after copy to dst."); } } @@ -932,13 +970,25 @@ class UnorderedMap { friend struct Impl::UnorderedMapPrint; }; -// Specialization of deep_copy for two UnorderedMap objects. +// Specialization of deep_copy() for two UnorderedMap objects. template <typename DKey, typename DT, typename DDevice, typename SKey, typename ST, typename SDevice, typename Hasher, typename EqualTo> inline void deep_copy( UnorderedMap<DKey, DT, DDevice, Hasher, EqualTo> &dst, const UnorderedMap<SKey, ST, SDevice, Hasher, EqualTo> &src) { - dst.create_copy_view(src); + dst.deep_copy_view(src); +} + +// Specialization of create_mirror() for an UnorderedMap object. 
+template <typename Key, typename ValueType, typename Device, typename Hasher, + typename EqualTo> +typename UnorderedMap<Key, ValueType, Device, Hasher, EqualTo>::HostMirror +create_mirror( + const UnorderedMap<Key, ValueType, Device, Hasher, EqualTo> &src) { + typename UnorderedMap<Key, ValueType, Device, Hasher, EqualTo>::HostMirror + dst; + dst.allocate_view(src); + return dst; } } // namespace Kokkos diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp index 88109fb0ba56ac4afe35e837b146ce152342792d..83ccfbf6305f409ededb8a78ef2c767b4567dee9 100644 --- a/packages/kokkos/containers/src/Kokkos_Vector.hpp +++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp @@ -172,9 +172,8 @@ class KOKKOS_DEPRECATED vector private: template <class T> - struct impl_is_input_iterator - : /* TODO replace this */ std::bool_constant< - !std::is_convertible<T, size_type>::value> {}; + struct impl_is_input_iterator : /* TODO replace this */ std::bool_constant< + !std::is_convertible_v<T, size_type>> {}; public: // TODO: can use detection idiom to generate better error message here later diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index 8f8cd9523b726379c62a4e2cd7f5d096988f8b13..a979ee40d8c88a194dcace0249b336d7f0dfa9e6 100644 --- a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -27,6 +27,18 @@ namespace Kokkos { namespace Impl { +//! Append to the label contained in view_ctor_prop. +template <typename... 
P> +auto append_to_label(const ViewCtorProp<P...>& view_ctor_prop, + const std::string& label) { + using vcp_t = ViewCtorProp<P...>; + static_assert(vcp_t::has_label); + vcp_t new_ctor_props(view_ctor_prop); + static_cast<ViewCtorProp<void, std::string>&>(new_ctor_props) + .value.append(label); + return new_ctor_props; +} + uint32_t find_hash_size(uint32_t size); template <typename Map> diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt index e69e46bb6a85c04bbfa4eda0cbd78ee96ee2fe51..6255a86c4614f9d4142c8151d6551bfdeb36e332 100644 --- a/packages/kokkos/containers/unit_tests/CMakeLists.txt +++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt @@ -1,8 +1,7 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) string(TOUPPER ${Tag} DEVICE) @@ -12,57 +11,49 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) set(UnitTestSources UnitTestMain.cpp) set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) file(MAKE_DIRECTORY ${dir}) - foreach(Name - Bitset - DualView - DynamicView - DynViewAPI_generic - DynViewAPI_rank12345 - DynViewAPI_rank67 - ErrorReporter - OffsetView - ScatterView - StaticCrsGraph - WithoutInitializing - UnorderedMap - Vector - ViewCtorPropEmbeddedDim - ) + foreach( + Name + Bitset + DualView + DynamicView + DynViewAPI_generic + DynViewAPI_rank12345 + DynViewAPI_rank67 + DynRankView_TeamScratch + 
ErrorReporter + OffsetView + ScatterView + StaticCrsGraph + WithoutInitializing + UnorderedMap + Vector + ViewCtorPropEmbeddedDim + ) if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 AND Name STREQUAL "Vector") continue() # skip Kokkos::vector test if deprecated code 4 is not enabled endif() # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. set(file ${dir}/Test${Tag}_${Name}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include <Test${Tag}_Category.hpp>\n" - "#include <Test${Name}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${Tag}_Category.hpp>\n" "#include <Test${Name}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() #fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj if(KOKKOS_ENABLE_CUDA AND WIN32) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. 
extractor: bad opc 0 if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) + kokkos_add_executable_and_test(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() -SET(COMPILE_ONLY_SOURCES - TestCreateMirror.cpp - TestDualViewParameterPack.cpp - TestIsViewTrait.cpp -) -KOKKOS_ADD_EXECUTABLE( - ContainersTestCompileOnly - SOURCES - TestCompileMain.cpp - ${COMPILE_ONLY_SOURCES} +set(COMPILE_ONLY_SOURCES TestCreateMirror.cpp TestDualViewParameterPack.cpp TestIsViewTrait.cpp + TestDynRankViewTypedefs.cpp ) +kokkos_add_executable(ContainersTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES}) diff --git a/packages/kokkos/containers/unit_tests/Makefile b/packages/kokkos/containers/unit_tests/Makefile index 2e35832cc8916e998b075ee2f85a7e79a36d7ccf..18410882bca9acd1f4e3c0c53ae294ea5003c07d 100644 --- a/packages/kokkos/containers/unit_tests/Makefile +++ b/packages/kokkos/containers/unit_tests/Makefile @@ -35,8 +35,8 @@ TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynV tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include<Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include<Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include<Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include<Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ )\ ) \ ) diff --git a/packages/kokkos/containers/unit_tests/TestBitset.hpp b/packages/kokkos/containers/unit_tests/TestBitset.hpp index 
3ad0d2bf573431868464f80fe1af413c4b9da8db..91dc1710e5f8487c2559f290730f86844314dbc0 100644 --- a/packages/kokkos/containers/unit_tests/TestBitset.hpp +++ b/packages/kokkos/containers/unit_tests/TestBitset.hpp @@ -23,6 +23,8 @@ #include <Kokkos_Bitset.hpp> #include <array> +#include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp> + namespace Test { namespace Impl { @@ -37,7 +39,7 @@ struct TestBitset { TestBitset(bitset_type const& bitset) : m_bitset(bitset) {} - unsigned testit(unsigned collisions) { + unsigned testit(unsigned long long collisions) { execution_space().fence(); unsigned count = 0; @@ -155,7 +157,7 @@ void test_bitset() { { unsigned ts = 100u; - bitset_type b1; + bitset_type b1(Kokkos::view_alloc("MyBitset"), 0); ASSERT_TRUE(b1.is_allocated()); b1 = bitset_type(ts); @@ -165,6 +167,9 @@ void test_bitset() { ASSERT_TRUE(b1.is_allocated()); ASSERT_TRUE(b2.is_allocated()); ASSERT_TRUE(b3.is_allocated()); + + bitset_type b4; + ASSERT_FALSE(b4.is_allocated()); } std::array<unsigned, 7> test_sizes = { @@ -237,6 +242,24 @@ void test_bitset() { } TEST(TEST_CATEGORY, bitset) { test_bitset<TEST_EXECSPACE>(); } + +TEST(TEST_CATEGORY, bitset_default_constructor_no_alloc) { + using namespace Kokkos::Test::Tools; + listen_tool_events(Config::DisableAll(), Config::EnableAllocs()); + + auto success = validate_absence( + [&]() { + Kokkos::Bitset bs; + EXPECT_FALSE(bs.is_allocated()); + }, + [&](AllocateDataEvent) { + return MatchDiagnostic{true, {"Found alloc event"}}; + }); + ASSERT_TRUE(success); + + listen_tool_events(Config::DisableAll()); +} + } // namespace Test #endif // KOKKOS_TEST_BITSET_HPP diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp index a15e5fa299726e80d7d6654e3ea28e801cd5f4c5..5d03e6202a896b104d4097da6a04d0a33fa79948 100644 --- a/packages/kokkos/containers/unit_tests/TestDualView.hpp +++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp @@ -55,8 +55,8 @@ 
struct test_dualview_alloc { bool result = false; test_dualview_alloc(unsigned int size) { - result = run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >( - size, 3); + result = + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(size, 3); } }; @@ -71,7 +71,7 @@ struct test_dualview_copy_construction_and_assignment { using SrcViewType = Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>; using DstViewType = - Kokkos::DualView<const Scalar * [m], Kokkos::LayoutLeft, Device>; + Kokkos::DualView<const Scalar* [m], Kokkos::LayoutLeft, Device>; SrcViewType a("A", n, m); @@ -154,7 +154,7 @@ struct test_dualview_combinations { } test_dualview_combinations(unsigned int size, bool with_init) { - result = run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >( + result = run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>( size, 3, with_init); } }; @@ -253,21 +253,18 @@ struct test_dual_view_deep_copy { } // end run_me test_dual_view_deep_copy() { - run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(10, 5, - true); - run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(10, 5, - false); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(10, 5, true); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(10, 5, + false); // Test zero length but allocated (a.d_view.data!=nullptr but // a.d_view.span()==0) - run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(0, 5, true); - run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(0, 5, - false); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(0, 5, true); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(0, 5, false); // Test default constructed view - run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(-1, 5, - true); - run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(-1, 5, - false); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(-1, 5, 
true); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(-1, 5, + false); } }; @@ -282,15 +279,20 @@ struct test_dualview_resize { const unsigned int m = 5; const unsigned int factor = 2; - ViewType a("A", n, m); + ViewType a; + if constexpr (Initialize) + a = ViewType("A", n, m); + else + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::deep_copy(a.d_view, 1); /* Covers case "Resize on Device" */ a.modify_device(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); - else + if constexpr (Initialize) Kokkos::resize(a, factor * n, factor * m); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, factor * n, factor * m); ASSERT_EQ(a.extent(0), n * factor); ASSERT_EQ(a.extent(1), m * factor); @@ -298,33 +300,38 @@ struct test_dualview_resize { a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)), - SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view), - a_d_sum); + Kokkos::View<int, typename ViewType::t_dev::memory_space> errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy<t_dev_exec_space, Kokkos::Rank<2>>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, a_d_sum); - ASSERT_EQ(a_h_sum, 
scalar_type(a.extent(0) * a.extent(1))); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); /* Covers case "Resize on Host" */ a.modify_host(); - if (Initialize) - Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); - else + if constexpr (Initialize) Kokkos::resize(a, n / factor, m / factor); + else + Kokkos::resize(Kokkos::WithoutInitializing, a, n / factor, m / factor); ASSERT_EQ(a.extent(0), n / factor); ASSERT_EQ(a.extent(1), m / factor); @@ -332,30 +339,33 @@ struct test_dualview_resize { a.sync_device(Kokkos::DefaultExecutionSpace{}); // Check device view is initialized as expected - a_d_sum = 0; + Kokkos::deep_copy(errors_d, 0); // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)), - SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view), - a_d_sum); + Kokkos::parallel_for( + Kokkos::MDRangePolicy<t_dev_exec_space, Kokkos::Rank<2>>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - a_h_sum = 0; + errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_resize() { - run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(); } }; @@ -369,40 +379,51 @@ struct test_dualview_realloc { const unsigned int n = 10; const unsigned int 
m = 5; - ViewType a("A", n, m); - if (Initialize) - Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); - else + ViewType a; + if constexpr (Initialize) { + a = ViewType("A", n, m); Kokkos::realloc(a, n, m); + } else { + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + Kokkos::realloc(Kokkos::WithoutInitializing, a, n, m); + } + ASSERT_EQ(a.extent(0), n); + ASSERT_EQ(a.extent(1), m); Kokkos::deep_copy(a.d_view, 1); + a.modify_device(); a.sync_host(); // Check device view is initialized as expected - scalar_type a_d_sum = 0; // Execute on the execution_space associated with t_dev's memory space using t_dev_exec_space = typename ViewType::t_dev::memory_space::execution_space; - Kokkos::parallel_reduce( - Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)), - SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view), - a_d_sum); + Kokkos::View<int, typename ViewType::t_dev::memory_space> errors_d( + "errors"); + Kokkos::parallel_for( + Kokkos::MDRangePolicy<t_dev_exec_space, Kokkos::Rank<2>>( + {0, 0}, {a.d_view.extent(0), a.d_view.extent(1)}), + KOKKOS_LAMBDA(int i, int j) { + if (a.d_view(i, j) != 1) Kokkos::atomic_inc(errors_d.data()); + }); + int errors_d_scalar; + Kokkos::deep_copy(errors_d_scalar, errors_d); // Check host view is synced as expected - scalar_type a_h_sum = 0; + int errors_h_scalar = 0; for (size_t i = 0; i < a.h_view.extent(0); ++i) for (size_t j = 0; j < a.h_view.extent(1); ++j) { - a_h_sum += a.h_view(i, j); + if (a.h_view(i, j) != 1) ++errors_h_scalar; } // Check - ASSERT_EQ(a_h_sum, scalar_type(a.extent(0) * a.extent(1))); - ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(errors_d_scalar, 0); + ASSERT_EQ(errors_h_scalar, 0); } // end run_me test_dualview_realloc() { - run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device>>(); } }; @@ -463,12 +484,23 @@ TEST(TEST_CATEGORY, dualview_deep_copy) { 
test_dualview_deep_copy<double, TEST_EXECSPACE>(); } +struct NoDefaultConstructor { + NoDefaultConstructor(int i_) : i(i_) {} + KOKKOS_FUNCTION operator int() const { return i; } + + int i; +}; + TEST(TEST_CATEGORY, dualview_realloc) { test_dualview_realloc<int, TEST_EXECSPACE>(); + Impl::test_dualview_realloc<NoDefaultConstructor, TEST_EXECSPACE, + /* Initialize */ false>(); } TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize<int, TEST_EXECSPACE>(); + Impl::test_dualview_resize<NoDefaultConstructor, TEST_EXECSPACE, + /* Initialize */ false>(); } namespace { @@ -488,58 +520,26 @@ namespace { * that we keep the semantics of UVM DualViews intact. */ // modify if we have other UVM enabled backends -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) // OR other UVM builds -#define UVM_ENABLED_BUILD -#endif - -#ifdef UVM_ENABLED_BUILD -template <typename ExecSpace> -struct UVMSpaceFor; -#endif - -#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA -template <> -struct UVMSpaceFor<Kokkos::Cuda> { - using type = Kokkos::CudaUVMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_SYCL // specific to SYCL -template <> -struct UVMSpaceFor<Kokkos::Experimental::SYCL> { - using type = Kokkos::Experimental::SYCLSharedUSMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_HIP // specific to HIP -template <> -struct UVMSpaceFor<Kokkos::HIP> { - using type = Kokkos::HIPManagedSpace; -}; -#endif -#ifdef UVM_ENABLED_BUILD -template <> -struct UVMSpaceFor<Kokkos::DefaultHostExecutionSpace> { - using type = typename UVMSpaceFor<Kokkos::DefaultExecutionSpace>::type; -}; +#ifdef KOKKOS_HAS_SHARED_SPACE +template <typename ExecutionSpace> +using TestSharedSpace = Kokkos::SharedSpace; #else -template <typename ExecSpace> -struct UVMSpaceFor { - using type = typename ExecSpace::memory_space; -}; +template <typename ExecutionSpace> +using TestSharedSpace = typename ExecutionSpace::memory_space; #endif using ExecSpace = Kokkos::DefaultExecutionSpace; -using 
MemSpace = typename UVMSpaceFor<Kokkos::DefaultExecutionSpace>::type; +using MemSpace = TestSharedSpace<Kokkos::DefaultExecutionSpace>; using DeviceType = Kokkos::Device<ExecSpace, MemSpace>; using DualViewType = Kokkos::DualView<double*, Kokkos::LayoutLeft, DeviceType>; -using d_device = DeviceType; -using h_device = Kokkos::Device< - Kokkos::DefaultHostExecutionSpace, - typename UVMSpaceFor<Kokkos::DefaultHostExecutionSpace>::type>; +using ConstDualViewType = + Kokkos::DualView<const double*, Kokkos::LayoutLeft, DeviceType>; +using d_device = DeviceType; +using h_device = + Kokkos::Device<Kokkos::DefaultHostExecutionSpace, + TestSharedSpace<Kokkos::DefaultHostExecutionSpace>>; TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { DualViewType dv("myView", 100); @@ -603,14 +603,69 @@ TEST(TEST_CATEGORY, dualview_template_views_return_correct_executionspace_views) { DualViewType dv("myView", 100); dv.clear_sync_state(); - using hvt = decltype(dv.view<typename Kokkos::DefaultHostExecutionSpace>()); - using dvt = decltype(dv.view<typename Kokkos::DefaultExecutionSpace>()); + using hvt = decltype(dv.view<Kokkos::DefaultHostExecutionSpace>()); + using dvt = decltype(dv.view<Kokkos::DefaultExecutionSpace>()); ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), dvt::device_type::execution_space::name()); ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), hvt::device_type::execution_space::name()); } +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_views_from_const_dual_view) { + DualViewType dv("myView", 100); + ConstDualViewType const_dv = dv; + dv.clear_sync_state(); + ASSERT_EQ(dv.view<Kokkos::DefaultHostExecutionSpace>(), + const_dv.view<Kokkos::DefaultHostExecutionSpace>()); + ASSERT_EQ(dv.view<Kokkos::DefaultExecutionSpace>(), + const_dv.view<Kokkos::DefaultExecutionSpace>()); +} + +// User-defined types with a View data member, only host-constructible +template <class V> +class S { + V v_; + + public: + template <class... 
Extents> + S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + S() : v_("v", 10) {} +}; + +template <typename V> +auto initialize_view_of_views() { + Kokkos::DualView<V*, TEST_EXECSPACE> dv_v( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 3u); + + V v("v", 2); + V w("w", 2); + dv_v.h_view(0) = v; + dv_v.h_view(1) = w; + + dv_v.modify_host(); + dv_v.sync_device(); + + return dv_v; +} + +TEST(TEST_CATEGORY, dualview_sequential_host_init) { + auto dv_v = initialize_view_of_views<Kokkos::View<double*, TEST_EXECSPACE>>(); + dv_v.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv_v.d_view.size(), 2u); + ASSERT_EQ(dv_v.h_view.size(), 2u); + + initialize_view_of_views<S<Kokkos::View<double*, TEST_EXECSPACE>>>(); + + Kokkos::DualView<double*> dv( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 1u); + dv.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv.d_view.size(), 2u); + ASSERT_EQ(dv.h_view.size(), 2u); + dv.realloc(Kokkos::view_alloc(Kokkos::SequentialHostInit), 3u); + ASSERT_EQ(dv.d_view.size(), 3u); + ASSERT_EQ(dv.h_view.size(), 3u); +} } // anonymous namespace } // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp b/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp new file mode 100644 index 0000000000000000000000000000000000000000..95117a22e6e8c42baf391bf3ab9519f18e31d6ba --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp @@ -0,0 +1,260 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> +#include <Kokkos_DynRankView.hpp> + +namespace { + +// clang-format off +template<class DataType> +struct data_analysis { + using data_type = DataType; + using const_data_type = const DataType; + using runtime_data_type = DataType; + using runtime_const_data_type = const DataType; + using non_const_data_type = std::remove_const_t<DataType>; +}; + +template<class DataType> +struct data_analysis<DataType*> { + using data_type = typename data_analysis<DataType>::data_type*; + using const_data_type = typename data_analysis<DataType>::const_data_type*; + using runtime_data_type = typename data_analysis<DataType>::runtime_data_type*; + using runtime_const_data_type = typename data_analysis<DataType>::runtime_const_data_type*; + using non_const_data_type = typename data_analysis<DataType>::non_const_data_type*; +}; + +template<class DataType, size_t N> +struct data_analysis<DataType[N]> { + using data_type = typename data_analysis<DataType>::data_type[N]; + using const_data_type = typename data_analysis<DataType>::const_data_type[N]; + using runtime_data_type = typename data_analysis<DataType>::runtime_data_type*; + using runtime_const_data_type = typename data_analysis<DataType>::runtime_const_data_type*; + using non_const_data_type = typename data_analysis<DataType>::non_const_data_type[N]; +}; + +template<class ViewType, class ViewTraitsType, class DataType, class Layout, class Space, class MemoryTraitsType, + class HostMirrorSpace, class ValueType, class ReferenceType> +constexpr bool test_view_typedefs_impl() { + // ======================== + // inherited from ViewTraits + // ======================== + static_assert(std::is_same_v<typename ViewType::data_type, DataType>); + static_assert(std::is_same_v<typename ViewType::const_data_type, typename data_analysis<DataType>::const_data_type>); + 
static_assert(std::is_same_v<typename ViewType::non_const_data_type, typename data_analysis<DataType>::non_const_data_type>); + + // FIXME: these should be deprecated and for proper testing (I.e. where this is different from data_type) + // we would need ensemble types which use the hidden View dimension facility of View (i.e. which make + // "specialize" not void) + static_assert(std::is_same_v<typename ViewType::scalar_array_type, DataType>); + static_assert(std::is_same_v<typename ViewType::const_scalar_array_type, typename data_analysis<DataType>::const_data_type>); + static_assert(std::is_same_v<typename ViewType::non_const_scalar_array_type, typename data_analysis<DataType>::non_const_data_type>); + static_assert(std::is_same_v<typename ViewType::specialize, void>); + + // FIXME: value_type definition conflicts with mdspan value_type + static_assert(std::is_same_v<typename ViewType::value_type, ValueType>); + static_assert(std::is_same_v<typename ViewType::const_value_type, const ValueType>); + static_assert(std::is_same_v<typename ViewType::non_const_value_type, std::remove_const_t<ValueType>>); + + // FIXME: should maybe be deprecated + static_assert(std::is_same_v<typename ViewType::array_layout, Layout>); + + // FIXME: should be deprecated and is some complicated impl type + // static_assert(!std::is_void_v<typename ViewType::dimension>); + + static_assert(std::is_same_v<typename ViewType::execution_space, typename Space::execution_space>); + static_assert(std::is_same_v<typename ViewType::memory_space, typename Space::memory_space>); + static_assert(std::is_same_v<typename ViewType::device_type, Kokkos::Device<typename ViewType::execution_space, typename ViewType::memory_space>>); + static_assert(std::is_same_v<typename ViewType::memory_traits, MemoryTraitsType>); + static_assert(std::is_same_v<typename ViewType::host_mirror_space, HostMirrorSpace>); + static_assert(std::is_same_v<typename ViewType::size_type, typename 
ViewType::memory_space::size_type>); + + // FIXME: should be deprecated in favor of reference + static_assert(std::is_same_v<typename ViewType::reference_type, ReferenceType>); + // FIXME: should be deprecated in favor of data_handle_type + static_assert(std::is_same_v<typename ViewType::pointer_type, ValueType*>); + + // ========================================= + // in Legacy View: some helper View variants + // ========================================= + + // FIXME: in contrast to View, hooks_policy is not propagated + static_assert(std::is_same_v<typename ViewType::traits, ViewTraitsType>); + static_assert(std::is_same_v<typename ViewType::array_type, + Kokkos::DynRankView<typename ViewType::data_type, typename ViewType::array_layout, + typename ViewType::device_type, //typename ViewTraitsType::hooks_policy, + typename ViewType::memory_traits>>); + static_assert(std::is_same_v<typename ViewType::const_type, + Kokkos::DynRankView<typename ViewType::const_data_type, typename ViewType::array_layout, + typename ViewType::device_type, //typename ViewTraitsType::hooks_policy, + typename ViewType::memory_traits>>); + static_assert(std::is_same_v<typename ViewType::non_const_type, + Kokkos::DynRankView<typename ViewType::non_const_data_type, typename ViewType::array_layout, + typename ViewType::device_type, //typename ViewTraitsType::hooks_policy, + typename ViewType::memory_traits>>); + static_assert(std::is_same_v<typename ViewType::HostMirror, + Kokkos::DynRankView<typename ViewType::non_const_data_type, typename ViewType::array_layout, + HostMirrorSpace + /*, typename ViewTraitsType::hooks_policy*/>>); + +/* FIXME: these don't exist in DynRankView, should they? + using uniform_layout_type = std::conditional_t<ViewType::rank()==0 || (ViewType::rank()==0 && + std::is_same_v<Layout, Kokkos::LayoutRight>), + Kokkos::LayoutLeft, Layout>; + + // Uhm uniformtype removes all memorytraits? 
+ static_assert(std::is_same_v<typename ViewType::uniform_type, + Kokkos::DynRankView<typename ViewType::data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_const_type, + Kokkos::DynRankView<typename ViewType::const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_runtime_type, + Kokkos::DynRankView<typename data_analysis<DataType>::runtime_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_runtime_const_type, + Kokkos::DynRankView<typename data_analysis<DataType>::runtime_const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + + using anonymous_device_type = Kokkos::Device<typename ViewType::execution_space, Kokkos::AnonymousSpace>; + static_assert(std::is_same_v<typename ViewType::uniform_nomemspace_type, + Kokkos::DynRankView<typename ViewType::data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_const_nomemspace_type, + Kokkos::DynRankView<typename ViewType::const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_runtime_nomemspace_type, + Kokkos::DynRankView<typename data_analysis<DataType>::runtime_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_runtime_const_nomemspace_type, + Kokkos::DynRankView<typename data_analysis<DataType>::runtime_const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); +*/ + + // ================================== + // mdspan compatibility + // ================================== + + 
// FIXME: This typedef caused some weird issue with MSVC+NVCC + // static_assert(std::is_same_v<typename ViewType::layout_type, Layout>); + // FIXME: Not supported yet + // static_assert(std::is_same_v<typename ViewType::extents_type, >); + // static_assert(std::is_same_v<typename ViewType::mapping_type, >); + // static_assert(std::is_same_v<typename ViewType::accessor_type, >); + + static_assert(std::is_same_v<typename ViewType::element_type, ValueType>); + // FIXME: should be remove_const_t<element_type> + static_assert(std::is_same_v<typename ViewType::value_type, ValueType>); + // FIXME: should be extents_type::index_type + static_assert(std::is_same_v<typename ViewType::index_type, typename Space::memory_space::size_type>); + static_assert(std::is_same_v<typename ViewType::rank_type, size_t>); + + // FIXME: should come from accessor_type + static_assert(std::is_same_v<typename ViewType::data_handle_type, typename ViewType::pointer_type>); + static_assert(std::is_same_v<typename ViewType::reference, typename ViewType::reference_type>); + return true; +} + +// Helper function to unpack data type and other args from the View, and pass them on +template<class T, class ... ViewArgs> +struct ViewParams {}; + +template<class L, class S, class M, class HostMirrorSpace, class ValueType, class ReferenceType, class T, class ... 
ViewArgs> +constexpr bool test_view_typedefs(ViewParams<T, ViewArgs...>) { + return test_view_typedefs_impl<Kokkos::DynRankView<T, ViewArgs...>, Kokkos::ViewTraits<T*******, ViewArgs...>, + T, L, S, M, HostMirrorSpace, ValueType, ReferenceType>(); +} + + +constexpr bool is_host_exec = std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::DefaultHostExecutionSpace>; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) || defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +constexpr bool has_unified_mem_space = true; +#else +constexpr bool has_unified_mem_space = false; +#endif + +// The test take explicit template arguments for: LayoutType, Space, MemoryTraits, HostMirrorSpace, ValueType, ReferenceType +// The ViewParams is just a type pack for the View template arguments + +// Kokkos::View<int> +namespace TestInt { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t<is_host_exec, Kokkos::DefaultExecutionSpace, + // otherwise if unified memory is not on its HostSpace + std::conditional_t<!has_unified_mem_space, Kokkos::HostSpace, + // otherwise its the following Device type + Kokkos::Device<Kokkos::DefaultHostExecutionSpace, typename Kokkos::DefaultExecutionSpace::memory_space>>>; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, int, int&>( + ViewParams<int>{})); +} + +// Kokkos::View<int, DefaultExecutionSpace> +namespace TestIntDefaultExecutionSpace { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, it is HostSpace (note difference from View<int> ...) 
+ using host_mirror_space = std::conditional_t<is_host_exec, Kokkos::HostSpace, + // otherwise if unified memory is not on its also HostSpace! + std::conditional_t<!has_unified_mem_space, Kokkos::HostSpace, + // otherwise its the following memory space ... + Kokkos::DefaultExecutionSpace::memory_space>>; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, int, int&>( + ViewParams<int, Kokkos::DefaultExecutionSpace>{})); +} + +// Kokkos::View<const float, Kokkos::HostSpace> +namespace TestFloatPPHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::HostSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, const float, const float&>( + ViewParams<const float, Kokkos::HostSpace>{})); +} + +// Kokkos::View<float, Kokkos::Device<Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>> +namespace TestFloatPPDeviceDefaultHostExecHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::Device<Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, float, float&>( + ViewParams<float, Kokkos::LayoutRight, Kokkos::Device<Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>>{})); +} + +// Kokkos::View<int, Kokkos::MemoryTraits<Kokkos::Atomic>> +namespace TestIntAtomic { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<Kokkos::Atomic>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t<is_host_exec, Kokkos::DefaultExecutionSpace, + // otherwise if unified memory is not on its HostSpace + 
std::conditional_t<!has_unified_mem_space, Kokkos::HostSpace, + // otherwise its the following Device type + Kokkos::Device<Kokkos::DefaultHostExecutionSpace, typename Kokkos::DefaultExecutionSpace::memory_space>>>; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, int, + Kokkos::Impl::AtomicDataElement<Kokkos::ViewTraits<int*******, Kokkos::MemoryTraits<Kokkos::Atomic>>>>( + ViewParams<int, Kokkos::MemoryTraits<Kokkos::Atomic>>{})); +} +// clang-format on +} // namespace diff --git a/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp b/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e5f8860de76ce4b0d43e1054033beab6d6772dec --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp @@ -0,0 +1,72 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <gtest/gtest.h> + +#include <Kokkos_DynRankView.hpp> + +namespace { + +void test_dyn_rank_view_team_scratch() { + using execution_space = TEST_EXECSPACE; + using memory_space = execution_space::scratch_memory_space; + using drv_type = Kokkos::DynRankView<int, memory_space>; + using policy_type = Kokkos::TeamPolicy<execution_space>; + using team_type = policy_type::member_type; + + int N0 = 10, N1 = 4, N2 = 3; + size_t shmem_size = drv_type::shmem_size(N0, N1, N2); + ASSERT_GE(shmem_size, N0 * N1 * N2 * sizeof(int)); + + Kokkos::View<unsigned, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic>> + errors("errors"); + auto policy = policy_type(1, Kokkos::AUTO) + .set_scratch_size(0, Kokkos::PerTeam(shmem_size)); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(const team_type& team) { + drv_type scr(team.team_scratch(0), N0, N1, N2); + // Control that the code ran at all + if (scr.rank() != 3) errors() |= 1u; + if (scr.extent_int(0) != N0) errors() |= 2u; + if (scr.extent_int(1) != N1) errors() |= 4u; + if (scr.extent_int(2) != N2) errors() |= 8u; + Kokkos::parallel_for( + Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { scr(i, j, k) = i * 100 + j * 10 + k; }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { + if (scr(i, j, k) != i * 100 + j * 10 + k) + errors() |= 16u; + }); + errors() |= 256u; + }); + unsigned h_errors = 0; + Kokkos::deep_copy(h_errors, errors); + + ASSERT_EQ((h_errors & 1u), 0u) << "Rank mismatch"; + ASSERT_EQ((h_errors & 2u), 0u) << "extent 0 mismatch"; + ASSERT_EQ((h_errors & 4u), 0u) << "extent 1 mismatch"; + ASSERT_EQ((h_errors & 8u), 0u) << "extent 2 mismatch"; + ASSERT_EQ((h_errors & 16u), 0u) << "data access incorrect"; + ASSERT_EQ(h_errors, 256u); +} + +TEST(TEST_CATEGORY, dyn_rank_view_team_scratch) { + test_dyn_rank_view_team_scratch(); +} + +} // 
namespace diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 4ecb6cf25cc5bb1cf15746874821da1bd8ba4611..930c76c32c477e1b369a0a561d727299a4d22705 100644 --- a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -792,9 +792,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same<Kokkos::HostSpace, typename device::memory_space>::value - ? 1 - : 0; + std::is_same_v<Kokkos::HostSpace, typename device::memory_space> ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -817,9 +816,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same<Kokkos::HostSpace, typename device::memory_space>::value - ? 1 - : 0; + std::is_same_v<Kokkos::HostSpace, typename device::memory_space> ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -846,9 +844,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same<Kokkos::HostSpace, typename device::memory_space>::value - ? 1 - : 0; + std::is_same_v<Kokkos::HostSpace, typename device::memory_space> ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -879,8 +876,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same<Kokkos::HostSpace, - typename DeviceType::memory_space>::value + std::is_same_v<Kokkos::HostSpace, typename DeviceType::memory_space> ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -915,8 +911,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 
1 : 0; int is_same_memspace = - std::is_same<Kokkos::HostSpace, - typename DeviceType::memory_space>::value + std::is_same_v<Kokkos::HostSpace, typename DeviceType::memory_space> ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -943,8 +938,6 @@ class TestDynViewAPI { dView0 d("d"); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - // Rank 0 Kokkos::resize(d); @@ -1121,8 +1114,6 @@ class TestDynViewAPI { Kokkos::deep_copy(error_flag_host, error_flag); ASSERT_EQ(error_flag_host(), 0); #endif // MDRangePolict Rank < 7 - -#endif // defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) } static void run_test_scalar() { diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp index c8f8fed3b8b36a3108f3bcb3df075ab7ab79d6d9..94ccea86eb9b6a551aac1a5e21a26a092e7f8d8a 100644 --- a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -71,7 +71,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -85,7 +84,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 
4x larger than previous size // the first 1/4 should remain the same @@ -93,7 +91,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -108,7 +105,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -123,7 +119,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -137,7 +132,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 
4x larger than previous size // the first 1/4 should remain the same @@ -145,7 +139,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -160,7 +153,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -175,7 +167,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -189,14 +180,12 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // remove the final 3/4 entries i.e. 
first 1/4 remain unsigned da_resize = arg_total_size / 8; da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -210,7 +199,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Reproducer to demonstrate compile-time error of deep_copy @@ -229,7 +217,6 @@ struct TestDynamicView { device_dynamic_view.resize_serial(da_size); // Use parallel_for to populate device_dynamic_view and verify values -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); @@ -243,7 +230,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // Use an on-device View as intermediate to deep_copy the // device_dynamic_view to host, zero out the device_dynamic_view, @@ -251,13 +237,11 @@ struct TestDynamicView { Kokkos::deep_copy(device_view, device_dynamic_view); Kokkos::deep_copy(host_view, device_view); Kokkos::deep_copy(device_view, host_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); -#endif Kokkos::deep_copy(device_dynamic_view, device_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + value_type new_result_sum = 0.0; Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, da_size), @@ -267,21 +251,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif - - // Try to deep_copy device_dynamic_view directly to/from host. 
- // host-to-device currently fails to compile because DP and SP are - // swapped in the deep_copy implementation. - // Once that's fixed, both deep_copy's will fail at runtime because the - // destination execution space cannot access the source memory space. - // Check if the memory spaces are different before testing the deep_copy. - if (!Kokkos::SpaceAccessibility<Kokkos::HostSpace, - memory_space>::accessible) { - ASSERT_THROW(Kokkos::deep_copy(host_view, device_dynamic_view), - std::runtime_error); - ASSERT_THROW(Kokkos::deep_copy(device_dynamic_view, host_view), - std::runtime_error); - } } } }; diff --git a/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp index 0003a29468c5b430e863f27a52e2ec4379ba3c67..4ebab889c78fcc6c7ccd7d3feab0be21f5f129a1 100644 --- a/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp +++ b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp @@ -149,7 +149,6 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType> { } }; -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template <typename DeviceType> struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase<DeviceType> { @@ -178,7 +177,6 @@ struct ErrorReporterDriverUseLambda driver_base::check_expectations(reporter_capacity, test_size); } }; -#endif #ifdef KOKKOS_ENABLE_OPENMP struct ErrorReporterDriverNativeOpenMP @@ -205,8 +203,7 @@ struct ErrorReporterDriverNativeOpenMP // FIXME_MSVC MSVC just gets confused when using the base class in the // KOKKOS_CLASS_LAMBDA -#if !defined(KOKKOS_COMPILER_MSVC) && \ - (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +#ifndef KOKKOS_COMPILER_MSVC TEST(TEST_CATEGORY, ErrorReporterViaLambda) { TestErrorReporter<ErrorReporterDriverUseLambda<TEST_EXECSPACE>>(); } diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp 
index c133922e3defb049d50ee8b13cc74229d83f98d6..706b40fff3861787b55a5fcbfa76aaad484257e2 100644 --- a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -56,7 +56,18 @@ void test_offsetview_construction() { offset_view_type ov("firstOV", range0, range1); ASSERT_EQ("firstOV", ov.label()); - ASSERT_EQ(2, ov.Rank); + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + ASSERT_EQ(2u, ov.Rank); +#endif +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + + ASSERT_EQ(2u, ov.rank()); ASSERT_EQ(ov.begin(0), -1); ASSERT_EQ(ov.end(0), 4); @@ -67,7 +78,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(0), 5u); ASSERT_EQ(ov.extent(1), 5u); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Experimental::OffsetView<Scalar*, Device> offsetV1("OneDOffsetView", range0); @@ -149,7 +159,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif { offset_view_type ovCopy(ov); @@ -184,7 +193,6 @@ void test_offsetview_construction() { range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, point3_type{{extent0, extent1, extent2}}); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -207,7 +215,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif } view_type viewFromOV = ov.view(); @@ -232,7 +239,6 @@ void test_offsetview_construction() { view_type aView("aView", ov.extent(0), ov.extent(1)); Kokkos::deep_copy(aView, ov); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -242,7 +248,6 @@ void 
test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif } { // test view to offsetview deep copy @@ -251,7 +256,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, 99); Kokkos::deep_copy(ov, aView); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -261,7 +265,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif } } @@ -329,46 +332,131 @@ void test_offsetview_unmanaged_construction() { ASSERT_EQ(bb, ib); ASSERT_EQ(bb, ii); } +} + +template <typename Scalar, typename Device> +void test_offsetview_unmanaged_construction_death() { + // Preallocated memory (Only need a valid address for this test) + Scalar s; + + // Regular expression syntax on Windows is a pain. `.` does not match `\n`. + // Feel free to make it work if you have time to spare. +#ifdef _WIN32 +#define SKIP_REGEX_ON_WINDOWS(REGEX) "" +#else +#define SKIP_REGEX_ON_WINDOWS(REGEX) REGEX +#endif { using offset_view_type = Kokkos::Experimental::OffsetView<Scalar*, Device>; // Range calculations must be positive - ASSERT_NO_THROW(offset_view_type(&s, {0}, {1})); - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0})); - ASSERT_THROW(offset_view_type(&s, {0}, {-1}), std::runtime_error); + (void)offset_view_type(&s, {0}, {1}); + (void)offset_view_type(&s, {0}, {0}); + ASSERT_DEATH( + offset_view_type(&s, {0}, {-1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(-1\\) - begins\\[0\\] \\(0\\)\\) must be " + "non-negative")); } { using offset_view_type = Kokkos::Experimental::OffsetView<Scalar*, Device>; // Range calculations must not overflow - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0x7fffffffffffffffl})); - ASSERT_THROW(offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW( + 
(void)offset_view_type(&s, {0}, {0x7fffffffffffffffl}); + ASSERT_DEATH( + offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-1\\)\\) " + "overflows")); + ASSERT_DEATH( offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), - std::runtime_error); + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); + ASSERT_DEATH( + offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(0\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); } { using offset_view_type = Kokkos::Experimental::OffsetView<Scalar**, Device>; - // Should throw when the rank of begins and/or ends doesn't match that of - // OffsetView - ASSERT_THROW(offset_view_type(&s, {0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1}), std::runtime_error); - ASSERT_NO_THROW(offset_view_type(&s, {0, 0}, {1, 1})); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), - std::runtime_error); + // Should throw when the rank of begins and/or ends doesn't match that + // of OffsetView + ASSERT_DEATH( + offset_view_type(&s, {0}, {1}), + 
SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + (void)offset_view_type(&s, {0, 0}, {1, 1}); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); } +#undef SKIP_REGEX_ON_WINDOWS } template <typename Scalar, typename Device> @@ -377,8 +465,8 @@ void test_offsetview_subview() { 
Kokkos::Experimental::OffsetView<Scalar*, Device> sliceMe("offsetToSlice", {-10, 20}); { - auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0); - ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken."; + auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0); + ASSERT_EQ(offsetSubview.rank(), 0u) << "subview of offset is broken."; } } { // test subview 2 @@ -387,13 +475,13 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), -2); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -406,30 +494,29 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::make_pair(-30, -21)); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; ASSERT_EQ(offsetSubview.begin(0), -20); 
ASSERT_EQ(offsetSubview.end(0), 31); ASSERT_EQ(offsetSubview.begin(1), 0); ASSERT_EQ(offsetSubview.end(1), 9); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) using range_type = Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> >; using point_type = typename range_type::point_type; @@ -455,25 +542,24 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif } // slice 2 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -486,73 +572,72 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = 
Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } // slice 2 auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2a.rank(), 2u) << "subview of offset is broken."; { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } // slice 3 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = 
Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) template <class InputIt, class T, class BinaryOperation> KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T init, BinaryOperation op) { @@ -586,6 +671,7 @@ void test_offsetview_offsets_rank1() { KOKKOS_LAMBDA(const int ii, int& lerrors) { offset_view_type ov(v, {ii}); lerrors += (ov(3) != element({3 - ii})); + lerrors += (ov[3] != element({3 - ii})); }, errors); @@ -655,7 +741,6 @@ void test_offsetview_offsets_rank3() { ASSERT_EQ(0, errors); } -#endif TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction<int, TEST_EXECSPACE>(); @@ -665,11 +750,15 @@ TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction<int, TEST_EXECSPACE>(); } +TEST(TEST_CATEGORY_DEATH, offsetview_unmanaged_construction) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_offsetview_unmanaged_construction_death<int, TEST_EXECSPACE>(); +} + TEST(TEST_CATEGORY, offsetview_subview) { test_offsetview_subview<int, TEST_EXECSPACE>(); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) TEST(TEST_CATEGORY, offsetview_offsets_rank1) { test_offsetview_offsets_rank1<TEST_EXECSPACE>(); } @@ -681,7 +770,6 @@ TEST(TEST_CATEGORY, offsetview_offsets_rank2) { TEST(TEST_CATEGORY, offsetview_offsets_rank3) { test_offsetview_offsets_rank3<TEST_EXECSPACE>(); } -#endif } // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp 
b/packages/kokkos/containers/unit_tests/TestScatterView.hpp index 733f43122ce90f0dcb568fe1140559641bc40777..72c1afbe96a7ab026a4dd5304b8ab6c133428de3 100644 --- a/packages/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp @@ -33,11 +33,11 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, NumberType> { public: using scatter_view_type = - Kokkos::Experimental::ScatterView<NumberType * [12], Layout, DeviceType, + Kokkos::Experimental::ScatterView<NumberType* [12], Layout, DeviceType, Kokkos::Experimental::ScatterSum, Duplication, Contribution>; - using orig_view_type = Kokkos::View<NumberType * [12], Layout, DeviceType>; + using orig_view_type = Kokkos::View<NumberType* [12], Layout, DeviceType>; using size_type = typename Kokkos::HostSpace::size_type; @@ -134,11 +134,11 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, NumberType> { public: using scatter_view_type = - Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType, + Kokkos::Experimental::ScatterView<NumberType* [3], Layout, DeviceType, Kokkos::Experimental::ScatterProd, Duplication, Contribution>; - using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>; + using orig_view_type = Kokkos::View<NumberType* [3], Layout, DeviceType>; using size_type = typename Kokkos::HostSpace::size_type; @@ -235,11 +235,11 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, NumberType> { public: using scatter_view_type = - Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType, + Kokkos::Experimental::ScatterView<NumberType* [3], Layout, DeviceType, Kokkos::Experimental::ScatterMin, Duplication, Contribution>; - using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>; + using orig_view_type = Kokkos::View<NumberType* [3], Layout, DeviceType>; using size_type = typename Kokkos::HostSpace::size_type; @@ 
-335,11 +335,11 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, NumberType> { public: using scatter_view_type = - Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType, + Kokkos::Experimental::ScatterView<NumberType* [3], Layout, DeviceType, Kokkos::Experimental::ScatterMax, Duplication, Contribution>; - using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>; + using orig_view_type = Kokkos::View<NumberType* [3], Layout, DeviceType>; using size_type = typename Kokkos::HostSpace::size_type; @@ -714,7 +714,7 @@ void test_scatter_view(int64_t n) { test_sv_config.run_test(n); } #ifdef KOKKOS_ENABLE_SERIAL - if (!std::is_same<DeviceType, Kokkos::Serial>::value) { + if (!std::is_same_v<DeviceType, Kokkos::Serial>) { #endif test_scatter_view_config<DeviceType, Kokkos::LayoutRight, Kokkos::Experimental::ScatterNonDuplicated, diff --git a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp index acff5d0be04bf6d1c9afeeb95f550844986c4fc3..b1fdf54ed8e014cbdac9ce5aef47626a0e833723 100644 --- a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp +++ b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -207,7 +207,7 @@ void run_test_graph4() { const ordinal_type nnz = 24; ptr_row_map_type ptrRaw[] = {0, 4, 8, 10, 12, 14, 16, 20, 24}; ptr_entries_type indRaw[] = {0, 1, 4, 5, 0, 1, 4, 5, 2, 3, 2, 3, - 4, 5, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7}; + 4, 5, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7}; // Wrap pointers in unmanaged host views using local_row_map_type = typename hView::row_map_type; @@ -237,14 +237,14 @@ void run_test_graph4() { dx.row_map = typename dView::row_map_type(tmp_row_map.data(), numRows + 1); dx.entries = typename dView::entries_type(tmp_entries.data(), nnz); - ASSERT_TRUE((std::is_same<typename dView::row_map_type::memory_traits, - Kokkos::MemoryUnmanaged>::value)); - ASSERT_TRUE((std::is_same<typename 
dView::entries_type::memory_traits, - Kokkos::MemoryUnmanaged>::value)); - ASSERT_TRUE((std::is_same<typename hView::row_map_type::memory_traits, - Kokkos::MemoryUnmanaged>::value)); - ASSERT_TRUE((std::is_same<typename hView::entries_type::memory_traits, - Kokkos::MemoryUnmanaged>::value)); + ASSERT_TRUE((std::is_same_v<typename dView::row_map_type::memory_traits, + Kokkos::MemoryUnmanaged>)); + ASSERT_TRUE((std::is_same_v<typename dView::entries_type::memory_traits, + Kokkos::MemoryUnmanaged>)); + ASSERT_TRUE((std::is_same_v<typename hView::row_map_type::memory_traits, + Kokkos::MemoryUnmanaged>)); + ASSERT_TRUE((std::is_same_v<typename hView::entries_type::memory_traits, + Kokkos::MemoryUnmanaged>)); } } /* namespace TestStaticCrsGraph */ diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp index f63f1c6afe37faf507ab6e2abd43007801274071..fc7435a75e564a63ea13901bf7744a1a28be3d66 100644 --- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -68,7 +68,7 @@ struct TestInsert { } while (rehash_on_fail && failed_count > 0u); // Trigger the m_size mutable bug. 
- typename map_type::HostMirror map_h; + auto map_h = create_mirror(map); execution_space().fence(); Kokkos::deep_copy(map_h, map); execution_space().fence(); @@ -367,7 +367,7 @@ void test_deep_copy(uint32_t num_nodes) { } } - host_map_type hmap; + auto hmap = create_mirror(map); Kokkos::deep_copy(hmap, map); ASSERT_EQ(map.size(), hmap.size()); @@ -380,6 +380,7 @@ void test_deep_copy(uint32_t num_nodes) { } map_type mmap; + mmap.allocate_view(hmap); Kokkos::deep_copy(mmap, hmap); const_map_type cmap = mmap; @@ -424,7 +425,7 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) { Map n{}; n = Map{m.capacity()}; n.rehash(m.capacity()); - Kokkos::deep_copy(n, m); + n.create_copy_view(m); ASSERT_TRUE(m.is_allocated()); ASSERT_TRUE(n.is_allocated()); } @@ -459,7 +460,7 @@ struct UnorderedMapInsert { //! Insert multiple values. template <typename... Args> - void insert(Args &&... args) const { + void insert(Args &&...args) const { static_assert(sizeof...(Args) > 1, "Prefer the single value version"); constexpr size_t size = sizeof...(Args); Kokkos::Array<typename map_type::key_type, size> values{ @@ -533,8 +534,6 @@ TEST(TEST_CATEGORY, UnorderedMap_shallow_copyable_on_device) { ASSERT_EQ(1u, test_map_copy.m_map.size()); } -#if !defined(KOKKOS_ENABLE_CUDA) || \ - (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_LAMBDA)) void test_unordered_map_device_capture() { TestMapCopy::map_type map; @@ -548,7 +547,6 @@ void test_unordered_map_device_capture() { TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) { test_unordered_map_device_capture(); } -#endif /** * @test This test ensures that an @ref UnorderedMap can be built diff --git a/packages/kokkos/containers/unit_tests/TestVector.hpp b/packages/kokkos/containers/unit_tests/TestVector.hpp index a7d341b789d6d6c61f3037b8bca492fca9860752..abed2676d76dd01874374638a1969c6a1fda2be1 100644 --- a/packages/kokkos/containers/unit_tests/TestVector.hpp +++ b/packages/kokkos/containers/unit_tests/TestVector.hpp @@ -21,6 +21,8 @@ 
#include <iostream> #include <cstdlib> #include <cstdio> +#include <Kokkos_Macros.hpp> +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #include <Kokkos_Vector.hpp> namespace Test { @@ -231,7 +233,7 @@ void test_vector_allocate(unsigned int size) { TEST(TEST_CATEGORY, vector_combination) { test_vector_allocate<int, TEST_EXECSPACE>(10); test_vector_combinations<int, TEST_EXECSPACE>(10); - test_vector_combinations<int, TEST_EXECSPACE>(3057); + test_vector_combinations<long long int, TEST_EXECSPACE>(3057); } TEST(TEST_CATEGORY, vector_insert) { diff --git a/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp index 0246f11ddfe7ba4fc557e10ce6a15592341e3068..2edddcce34f451707f42ad239cd608cf25ef7e7e 100644 --- a/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp +++ b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp @@ -48,7 +48,7 @@ struct TestViewCtorProp_EmbeddedDim { void operator()(const int i) const { v(i) = i; } }; - static void test_vcpt(const int N0, const int N1) { + static void test_vcpt(const size_t N0, const size_t N1) { // Create two views to test { using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; @@ -78,16 +78,16 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same<CommonViewValueType, double>::value), true); + ASSERT_EQ((std::is_same_v<CommonViewValueType, double>), true); #if 0 // debug output - for ( int i = 0; i < N0*N1; ++i ) { - printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + for ( size_t i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%zu) = %lf\n ", i, hcv1(i) ); } printf( " Common value type view: %s \n", typeid( CVT() ).name() ); printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); - if ( std::is_same< CommonViewValueType, double >::value == true ) { + if ( 
std::is_same_v< CommonViewValueType, double > == true ) { printf("Proper common value_type\n"); } else { @@ -115,7 +115,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same<CommonViewValueType, int>::value), true); + ASSERT_EQ((std::is_same_v<CommonViewValueType, int>), true); } } @@ -148,7 +148,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same<CommonViewValueType, double>::value), true); + ASSERT_EQ((std::is_same_v<CommonViewValueType, double>), true); } { @@ -169,7 +169,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same<CommonViewValueType, int>::value), true); + ASSERT_EQ((std::is_same_v<CommonViewValueType, int>), true); } } diff --git a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index 7201cd402a95a5db073ba1a2908f0f9c25520f25..2932898554c52a68e2214996200b5085549ac6d5 100644 --- a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -37,11 +37,27 @@ #endif ///@} +/// Some tests are skipped for unified memory space +#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, \ + Kokkos::CudaSpace>) \ + GTEST_SKIP() << "skipping since unified memory requires additional " \ + "fences"; +#elif defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, \ + Kokkos::HIPSpace>) \ + GTEST_SKIP() << "skipping since unified memory requires additional " \ + "fences"; +#else +#define 
GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE +#endif + TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); - Kokkos::DualView<int*** * [1][2][3][4], TEST_EXECSPACE> bla("bla", 5, 6, 7, - 8); + Kokkos::DualView<int**** [1][2][3][4], TEST_EXECSPACE> bla("bla", 5, 6, 7, 8); auto success = validate_absence( [&]() { @@ -71,8 +87,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); - Kokkos::DualView<int*** * [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6, - 5); + Kokkos::DualView<int**** [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -101,8 +116,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); - Kokkos::DualView<int*** * [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6, - 5); + Kokkos::DualView<int**** [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -234,7 +248,7 @@ TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same<TEST_EXECSPACE, Kokkos::Threads>::value) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Threads>) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif @@ -269,7 +283,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_scatterview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 4, 5, 6, 7); auto success = 
validate_absence( @@ -301,7 +315,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -332,7 +346,7 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -373,13 +387,12 @@ TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same<typename TEST_EXECSPACE, Kokkos::Threads>::value) + if (std::is_same_v<typename TEST_EXECSPACE, Kokkos::Threads>) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif #if defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::HPX>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Experimental::HPX>) GTEST_SKIP() << "skipping since the HPX backend always fences with async " "dispatch disabled"; #endif @@ -657,6 +670,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) { TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) { GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE + GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), diff --git a/packages/kokkos/core/CMakeLists.txt b/packages/kokkos/core/CMakeLists.txt index 
0917928001a92749dacf4f863df4367e2e3a06ea..21f05f627242b1b2d46041b2de63393b067b55d4 100644 --- a/packages/kokkos/core/CMakeLists.txt +++ b/packages/kokkos/core/CMakeLists.txt @@ -1,22 +1,14 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() -FUNCTION(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) - IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() - ENDIF() +function(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) + if(NOT Kokkos_ENABLE_BENCHMARKS) + return() + endif() - IF(KOKKOS_HAS_TRILINOS) - message( - STATUS - "Benchmarks are not supported when building as part of Trilinos" - ) - RETURN() - ENDIF() + add_subdirectory(${DIR_NAME}) +endfunction() - ADD_SUBDIRECTORY(${DIR_NAME}) -ENDFUNCTION() - -KOKKOS_ADD_TEST_DIRECTORIES(unit_test) -KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test) +kokkos_add_test_directories(unit_test) +kokkos_add_benchmark_directory(perf_test) diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt index 7f3916da31272e796a5cc083ead1138f7deaa62a..0cb2c804d38370578d2586602472da2523e22be7 100644 --- a/packages/kokkos/core/perf_test/CMakeLists.txt +++ b/packages/kokkos/core/perf_test/CMakeLists.txt @@ -1,199 +1,135 @@ # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. 
# FIXME_OPENACC - temporarily disabled due to unimplemented features -IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - RETURN() -ENDIF() -IF (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - RETURN() -ENDIF() +if((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + return() +endif() +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + return() +endif() # all PerformanceTest_* executables are part of regular tests # TODO: finish converting these into benchmarks (in progress) -IF(KOKKOS_ENABLE_TESTS) - IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE ( - PerformanceTest_SharedSpace - SOURCES test_sharedSpace.cpp - ) - ENDIF() - - KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - - IF(NOT Kokkos_ENABLE_OPENMPTARGET) - # FIXME OPENMPTARGET needs tasking - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_TaskDag - SOURCES test_taskdag.cpp - CATEGORIES PERFORMANCE - ) - ENDIF() -ENDIF() - -IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() -ENDIF() - -IF (KOKKOS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") -ENDIF() +if(KOKKOS_ENABLE_TESTS) + if(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) + kokkos_add_executable(PerformanceTest_SharedSpace SOURCES test_sharedSpace.cpp) + endif() + + kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + + kokkos_add_executable_and_test(PerformanceTest_TaskDag SOURCES test_taskdag.cpp CATEGORIES PERFORMANCE) +endif() + +if(NOT Kokkos_ENABLE_BENCHMARKS) + return() +endif() # Find or download google/benchmark library find_package(benchmark QUIET 1.5.6) -IF(benchmark_FOUND) - MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") -ELSE() +if(benchmark_FOUND) + message(STATUS "Using google benchmark found 
in ${benchmark_DIR}") +else() message(STATUS "No installed google benchmark found, fetching from GitHub") include(FetchContent) - SET(BENCHMARK_ENABLE_TESTING OFF) + set(BENCHMARK_ENABLE_TESTING OFF) list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") FetchContent_Declare( googlebenchmark DOWNLOAD_EXTRACT_TIMESTAMP FALSE - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b + URL https://github.com/google/benchmark/archive/refs/tags/v1.7.1.tar.gz + URL_HASH MD5=0459a6c530df9851bee6504c3e37c2e7 ) FetchContent_MakeAvailable(googlebenchmark) list(POP_BACK CMAKE_MESSAGE_INDENT) # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") - ENDIF() + if(CMAKE_CXX_CLANG_TIDY) + set_target_properties(benchmark PROPERTIES CXX_CLANG_TIDY "") + endif() target_compile_options(benchmark PRIVATE -w) target_compile_options(benchmark_main PRIVATE -w) -ENDIF() +endif() +function(KOKKOS_ADD_BENCHMARK NAME) + cmake_parse_arguments(BENCHMARK "" "" "SOURCES" ${ARGN}) + if(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) + message(WARNING "Unexpected arguments when adding a benchmark: " ${BENCHMARK_UNPARSED_ARGUMENTS}) + endif() -FUNCTION(KOKKOS_ADD_BENCHMARK NAME) - CMAKE_PARSE_ARGUMENTS( - BENCHMARK - "" - "" - "SOURCES" - ${ARGN} - ) - IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - MESSAGE( - WARNING - "Unexpected arguments when adding a benchmark: " - ${BENCHMARK_UNPARSED_ARGUMENTS} - ) - ENDIF() - - SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) - LIST(APPEND BENCHMARK_SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp - ) + set(BENCHMARK_NAME Kokkos_${NAME}) + list(APPEND BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp) - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - ${BENCHMARK_SOURCES} - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version - ) - 
TARGET_INCLUDE_DIRECTORIES( - ${BENCHMARK_NAME} - SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include - ) + add_executable(${BENCHMARK_NAME} ${BENCHMARK_SOURCES}) + target_link_libraries(${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version) + target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include) - FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) - SET_SOURCE_FILES_PROPERTIES( - ${SOURCE_FILE} - PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE} - ) - ENDFOREACH() - - STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - SET( - BENCHMARK_ARGS - --benchmark_counters_tabular=true - --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json - ) + foreach(SOURCE_FILE ${BENCHMARK_SOURCES}) + set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + endforeach() - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} - ) -ENDFUNCTION() - -SET( - BENCHMARK_SOURCES - PerfTestGramSchmidt.cpp - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - PerfTestHexGrad.cpp - PerfTest_MallocFree.cpp - PerfTest_ViewAllocate.cpp - PerfTest_ViewCopy_a123.cpp - PerfTest_ViewCopy_b123.cpp - PerfTest_ViewCopy_c123.cpp - PerfTest_ViewCopy_d123.cpp - PerfTest_ViewCopy_a45.cpp - PerfTest_ViewCopy_b45.cpp - PerfTest_ViewCopy_c45.cpp - PerfTest_ViewCopy_d45.cpp - PerfTest_ViewCopy_a6.cpp - PerfTest_ViewCopy_b6.cpp - PerfTest_ViewCopy_c6.cpp - PerfTest_ViewCopy_d6.cpp - PerfTest_ViewCopy_a7.cpp - PerfTest_ViewCopy_b7.cpp - PerfTest_ViewCopy_c7.cpp - PerfTest_ViewCopy_d7.cpp - PerfTest_ViewCopy_a8.cpp - PerfTest_ViewCopy_b8.cpp - PerfTest_ViewCopy_c8.cpp - PerfTest_ViewCopy_d8.cpp - PerfTest_ViewCopy_Raw.cpp - PerfTest_ViewFill_123.cpp - PerfTest_ViewFill_45.cpp - PerfTest_ViewFill_6.cpp - PerfTest_ViewFill_7.cpp - PerfTest_ViewFill_8.cpp - PerfTest_ViewFill_Raw.cpp - PerfTest_ViewResize_123.cpp - PerfTest_ViewResize_45.cpp - PerfTest_ViewResize_6.cpp - 
PerfTest_ViewResize_7.cpp - PerfTest_ViewResize_8.cpp - PerfTest_ViewResize_Raw.cpp -) + string(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) + set(BENCHMARK_ARGS --benchmark_counters_tabular=true --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json) + + add_test(NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS}) +endfunction() -IF(Kokkos_ENABLE_OPENMPTARGET) -# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - LIST(REMOVE_ITEM BENCHMARK_SOURCES +set(BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Benchmark - SOURCES ${BENCHMARK_SOURCES} + PerfTestHexGrad.cpp + PerfTest_MallocFree.cpp + PerfTest_ViewAllocate.cpp + PerfTest_ViewCopy_a123.cpp + PerfTest_ViewCopy_b123.cpp + PerfTest_ViewCopy_c123.cpp + PerfTest_ViewCopy_d123.cpp + PerfTest_ViewCopy_a45.cpp + PerfTest_ViewCopy_b45.cpp + PerfTest_ViewCopy_c45.cpp + PerfTest_ViewCopy_d45.cpp + PerfTest_ViewCopy_a6.cpp + PerfTest_ViewCopy_b6.cpp + PerfTest_ViewCopy_c6.cpp + PerfTest_ViewCopy_d6.cpp + PerfTest_ViewCopy_a7.cpp + PerfTest_ViewCopy_b7.cpp + PerfTest_ViewCopy_c7.cpp + PerfTest_ViewCopy_d7.cpp + PerfTest_ViewCopy_a8.cpp + PerfTest_ViewCopy_b8.cpp + PerfTest_ViewCopy_c8.cpp + PerfTest_ViewCopy_d8.cpp + PerfTest_ViewCopy_Raw.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp + PerfTest_ViewFill_Raw.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp + PerfTest_ViewResize_Raw.cpp ) -IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) - KOKKOS_ADD_BENCHMARK( - Benchmark_Atomic_MinMax - SOURCES test_atomic_minmax_simple.cpp +if(Kokkos_ENABLE_OPENMPTARGET) + # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + list(REMOVE_ITEM BENCHMARK_SOURCES 
PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp ) -ENDIF() +endif() + +kokkos_add_benchmark(PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES}) + +kokkos_add_benchmark(Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp) # FIXME_NVHPC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - KOKKOS_ADD_BENCHMARK( - PerformanceTest_Mempool - SOURCES test_mempool.cpp - ) -ENDIF() +if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + kokkos_add_benchmark(PerformanceTest_Mempool SOURCES test_mempool.cpp) +endif() -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Atomic - SOURCES test_atomic.cpp -) +kokkos_add_benchmark(PerformanceTest_Atomic SOURCES test_atomic.cpp) diff --git a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp index 98cb246c71e1faed8191791af2111c42acd0b300..1ebe750f2164ab93f8fcec0546e2f2372511ec58 100644 --- a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp +++ b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp @@ -34,10 +34,10 @@ struct HexGrad { enum { NSpace = 3, NNode = 8 }; using elem_coord_type = - Kokkos::View<CoordScalarType * [NSpace][NNode], execution_space>; + Kokkos::View<CoordScalarType* [NSpace][NNode], execution_space>; using elem_grad_type = - Kokkos::View<GradScalarType * [NSpace][NNode], execution_space>; + Kokkos::View<GradScalarType* [NSpace][NNode], execution_space>; elem_coord_type coords; elem_grad_type grad_op; diff --git a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp index 2110f38a916f813b8a4a96f6af5899a6c59910c9..03340a5d6de4841fb3da4f59a2fbd64cce1acac9 100644 --- a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp @@ -21,7 +21,6 @@ #include <Kokkos_Random.hpp> #include <utility> -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA namespace Test { template <class Scalar> std::pair<double, 
Scalar> custom_reduction_test(int N, int R) { @@ -130,4 +129,3 @@ BENCHMARK(CustomReduction<double>) ->UseManualTime(); } // namespace Test -#endif diff --git a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index d2a3d0b823a214de6f73d44e5280f1183fe1c5ae..aa23ddbb60722506b8dd58345401d9cd691d89c8 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -56,8 +56,7 @@ bool is_overlapping<Kokkos::HIP>(const Kokkos::HIP&) { #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> -bool is_overlapping<Kokkos::Experimental::SYCL>( - const Kokkos::Experimental::SYCL&) { +bool is_overlapping<Kokkos::SYCL>(const Kokkos::SYCL&) { return true; } #endif diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 67a8d7e555451e7426b00b2a2f3a3dfe8d9eb03e..e4db40e128c3336e79577e39d0cc10abbe1e0c30 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewDeepCopy_Raw<Kokkos::LayoutLeft, Kokkos::LayoutLeft>) ->ArgName("N") ->Arg(10) @@ -38,6 +37,5 @@ BENCHMARK(ViewDeepCopy_Raw<Kokkos::LayoutRight, Kokkos::LayoutLeft>) ->ArgName("N") ->Arg(10) ->UseManualTime(); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp index c11074d9154fd71ff9d7bfc5678d6398d6e2aee5..57bba83a9c1ebca7806663dd26b756ee06603067 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || 
!defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewFill_Raw<Kokkos::LayoutLeft>) ->ArgName("N") ->Arg(N) @@ -28,6 +27,5 @@ BENCHMARK(ViewFill_Raw<Kokkos::LayoutRight>) ->ArgName("N") ->Arg(N) ->UseManualTime(); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp index 2d1bcbb3cab5edcda25fffe46b19d69f4c1f7d73..ab469cb647ca2bc751cfdab5e40d821811925429 100644 --- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewResize_NoInit_Raw<Kokkos::LayoutLeft>) ->ArgName("N") ->Arg(N) @@ -30,6 +29,5 @@ BENCHMARK(ViewResize_NoInit_Raw<Kokkos::LayoutRight>) ->Arg(N) ->UseManualTime() ->Iterations(R); -#endif } // namespace Test diff --git a/packages/kokkos/core/perf_test/test_atomic.cpp b/packages/kokkos/core/perf_test/test_atomic.cpp index ce3059f47d32d9816c5303fcaa61accb52013b97..af74723e7e01ca070c9fef44f9b1b061f2cfed85 100644 --- a/packages/kokkos/core/perf_test/test_atomic.cpp +++ b/packages/kokkos/core/perf_test/test_atomic.cpp @@ -390,7 +390,7 @@ static void Test_Atomic(benchmark::State& state) { static constexpr int LOOP = 100'000; -BENCHMARK(Test_Atomic<int>)->Arg(LOOP)->Iterations(10); +BENCHMARK(Test_Atomic<int>)->Arg(30'000)->Iterations(10); BENCHMARK(Test_Atomic<long int>)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic<long long int>)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic<unsigned int>)->Arg(LOOP)->Iterations(10); @@ -398,4 +398,3 @@ BENCHMARK(Test_Atomic<unsigned long int>)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic<unsigned long long int>)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic<float>)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic<double>)->Arg(LOOP)->Iterations(10); -BENCHMARK(Test_Atomic<int>)->Arg(LOOP)->Iterations(10); diff --git 
a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp index b838c8eccf02e7177c07886be5b775130b405253..bc35d1c776f8286ea1b6c7babc853d5861f86ee3 100644 --- a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp +++ b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp @@ -183,7 +183,8 @@ double atomic_contentious_max_replacement(benchmark::State& state, Kokkos::parallel_reduce( con_length, KOKKOS_LAMBDA(const int i, T& inner) { - inner = Kokkos::atomic_max_fetch(&(input(0)), inner + 1); + inner = Kokkos::atomic_max_fetch(&(input(0)), + Kokkos::min(inner, max - 1) + 1); if (i == con_length - 1) { Kokkos::atomic_max_fetch(&(input(0)), max); inner = max; @@ -223,7 +224,8 @@ double atomic_contentious_min_replacement(benchmark::State& state, Kokkos::parallel_reduce( con_length, KOKKOS_LAMBDA(const int i, T& inner) { - inner = Kokkos::atomic_min_fetch(&(input(0)), inner - 1); + inner = Kokkos::atomic_min_fetch(&(input(0)), + Kokkos::max(inner, min + 1) - 1); if (i == con_length - 1) { Kokkos::atomic_min_fetch(&(input(0)), min); inner = min; @@ -246,7 +248,7 @@ static void Atomic_ContentiousMinReplacements(benchmark::State& state) { auto inp = prepare_input(1, std::numeric_limits<T>::max()); for (auto _ : state) { - const auto time = atomic_contentious_max_replacement(state, inp, length); + const auto time = atomic_contentious_min_replacement(state, inp, length); state.SetIterationTime(time); } diff --git a/packages/kokkos/core/perf_test/test_mempool.cpp b/packages/kokkos/core/perf_test/test_mempool.cpp index 9905740afb4d85e78dfe926fee81ac65bfe0949c..bdfe59b0b3bcb099467dff04170fb2c42b373e92 100644 --- a/packages/kokkos/core/perf_test/test_mempool.cpp +++ b/packages/kokkos/core/perf_test/test_mempool.cpp @@ -198,7 +198,7 @@ static void Mempool_Fill(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = 
get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, @@ -225,7 +225,7 @@ static void Mempool_Alloc_Dealloc(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, diff --git a/packages/kokkos/core/perf_test/test_sharedSpace.cpp b/packages/kokkos/core/perf_test/test_sharedSpace.cpp index 4f140c9409ad5e3b4db5d7b3522bbb2f931cceb8..3c06770e28612cc37d9e837fa3dc0331daf1feba 100644 --- a/packages/kokkos/core/perf_test/test_sharedSpace.cpp +++ b/packages/kokkos/core/perf_test/test_sharedSpace.cpp @@ -103,7 +103,7 @@ size_t getDeviceMemorySize() { #elif defined KOKKOS_ENABLE_HIP return Kokkos::HIP{}.hip_device_prop().totalGlobalMem; #elif defined KOKKOS_ENABLE_SYCL - auto device = Kokkos::Experimental::SYCL{}.sycl_queue().get_device(); + auto device = Kokkos::SYCL{}.sycl_queue().get_device(); return device.get_info<sycl::info::device::global_mem_size>(); #else #error \ diff --git a/packages/kokkos/core/perf_test/test_taskdag.cpp b/packages/kokkos/core/perf_test/test_taskdag.cpp index fccaab64ddf1821169484b533b1f64dcab585485..347d9748b5a9f8dcd7e757efecf32d85e3b68645 100644 --- a/packages/kokkos/core/perf_test/test_taskdag.cpp +++ b/packages/kokkos/core/perf_test/test_taskdag.cpp @@ -32,6 +32,11 @@ int main() { return 0; } #include <Kokkos_Timer.hpp> +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + using ExecSpace = 
Kokkos::DefaultExecutionSpace; inline long eval_fib(long n) { @@ -223,4 +228,8 @@ int main(int argc, char* argv[]) { return 0; } +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + #endif diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt index 012af0a7d06ac5df551ceae49ca06bdee77802f6..72663739a1423c89bb0c12bfcaf98fac7343e699 100644 --- a/packages/kokkos/core/src/CMakeLists.txt +++ b/packages/kokkos/core/src/CMakeLists.txt @@ -1,116 +1,125 @@ -KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} - ${KOKKOS_TOP_BUILD_DIR} -) -IF (NOT desul_FOUND) - IF(KOKKOS_ENABLE_CUDA) - SET(DESUL_ATOMICS_ENABLE_CUDA ON) - ENDIF() - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP) - SET(DESUL_ATOMICS_ENABLE_HIP ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_SYCL) - SET(DESUL_ATOMICS_ENABLE_SYCL ON) - ENDIF() - IF(KOKKOS_ENABLE_OPENMPTARGET) - SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP - ENDIF() - CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp - ) - KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ${KOKKOS_TOP_BUILD_DIR}) +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + set(DESUL_ATOMICS_ENABLE_CUDA ON) + endif() + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_HIP) + set(DESUL_ATOMICS_ENABLE_HIP ON) + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) + endif() + 
if(KOKKOS_ENABLE_SYCL) + set(DESUL_ATOMICS_ENABLE_SYCL ON) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + endif() + endif() + if(KOKKOS_ENABLE_OPENMPTARGET) + set(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP + endif() + if(KOKKOS_ENABLE_OPENACC) + # FIXME_OPENACC FIXME_CLACC - Below condition will be removed if Clacc can compile atomics. + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + set(DESUL_ATOMICS_ENABLE_OPENACC ON) + endif() + endif() + configure_file( + ${KOKKOS_SOURCE_DIR}/tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp ) -ENDIF() + kokkos_include_directories(${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/" +install( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.hpp" PATTERN "*.h" ) -SET(KOKKOS_CORE_SRCS) -APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CORE_HEADERS) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) - -IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENMP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENMPTARGET) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_OPENACC) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_THREADS) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_HPX) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) -ENDIF() - -IF (NOT KOKKOS_ENABLE_MEMKIND) - LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_HBWSpace.cpp) -ENDIF() - -IF (KOKKOS_ENABLE_SERIAL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) -ENDIF() - -IF (NOT desul_FOUND) - IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_CUDA.cpp) - ELSEIF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_HIP.cpp) - ELSEIF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_SYCL.cpp) - ENDIF() - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/*/*/*.inc*) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) 
- - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul" - "${CMAKE_CURRENT_BINARY_DIR}/desul" +set(KOKKOS_CORE_SRCS) +append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CORE_HEADERS) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) + +if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/Kokkos_Cuda_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENMP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/Kokkos_OpenMP_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) +endif() + +if(KOKKOS_ENABLE_OPENACC) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) +endif() + +if(KOKKOS_ENABLE_THREADS) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) +endif() + +if(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) +endif() + +if(KOKKOS_ENABLE_HPX) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS 
${CMAKE_CURRENT_SOURCE_DIR}/HPX/Kokkos_HPX_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) +endif() + +if(KOKKOS_ENABLE_SERIAL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/Kokkos_Serial_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) +endif() + +if(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +endif() + +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_CUDA.cpp) + elseif(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_HIP.cpp) + elseif(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_SYCL.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/*/*/*.inc*) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) + + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul" "${CMAKE_CURRENT_BINARY_DIR}/desul" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.inc" @@ -118,33 +127,26 @@ IF (NOT desul_FOUND) PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal desul_atomics copy") -ELSE() - MESSAGE(STATUS "Using external desul_atomics install found at:") - MESSAGE(STATUS " " ${desul_DIR}) -ENDIF() - + message(STATUS "Using internal desul_atomics copy") +else() + message(STATUS "Using external desul_atomics install found 
at:") + message(STATUS " " ${desul_DIR}) +endif() -KOKKOS_ADD_LIBRARY( - kokkoscore - SOURCES ${KOKKOS_CORE_SRCS} - HEADERS ${KOKKOS_CORE_HEADERS} +kokkos_add_library( + kokkoscore SOURCES ${KOKKOS_CORE_SRCS} HEADERS ${KOKKOS_CORE_HEADERS} ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -IF (NOT desul_FOUND) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include - ) -ENDIF() +if(NOT desul_FOUND) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -IF (Kokkos_ENABLE_IMPL_MDSPAN) - MESSAGE(STATUS "Experimental mdspan support is enabled") +if(Kokkos_ENABLE_IMPL_MDSPAN) + message(STATUS "Experimental mdspan support is enabled") # Some compilers now include mdspan... 
we just flag on their version # for now until we can get some compiler detection support @@ -152,67 +154,56 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) check_include_file_cxx(experimental/mdspan KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN) check_include_file_cxx(mdspan KOKKOS_COMPILER_SUPPORTS_MDSPAN) - if (Kokkos_ENABLE_MDSPAN_EXTERNAL) - MESSAGE(STATUS "Using external mdspan") + if(Kokkos_ENABLE_MDSPAN_EXTERNAL) + message(STATUS "Using external mdspan") target_link_libraries(kokkoscore PUBLIC std::mdspan) elseif(KOKKOS_COMPILER_SUPPORTS_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied mdspan") elseif(KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied experimental/mdspan") else() - KOKKOS_LIB_INCLUDE_DIRECTORIES( - kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include - ) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/__p0009_bits/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/mdspan) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/__p0009_bits/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/mdspan) - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/" + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "mdspan" PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal mdspan directory ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include") + message(STATUS "Using internal mdspan directory ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include") endif() -ENDIF() - -KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND) -IF (NOT 
KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -ENDIF() -KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) +endif() + +kokkos_link_tpl(kokkoscore PUBLIC HWLOC) +kokkos_link_tpl(kokkoscore PUBLIC CUDA) +kokkos_link_tpl(kokkoscore PUBLIC HPX) +kokkos_link_tpl(kokkoscore PUBLIC LIBDL) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread -IF (NOT WIN32) - KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) -ENDIF() +if(NOT WIN32) + kokkos_link_tpl(kokkoscore PUBLIC THREADS) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_link_tpl(kokkoscore PUBLIC ROCM) +endif() # FIXME: We need a proper solution to figure out whether to enable # libatomic # Most compilers only require libatomic for 128-bit CAS # I (CT) had removed 128bit CAS from desul to not need libatomic. -IF (KOKKOS_ENABLE_OPENMPTARGET) +if(KOKKOS_ENABLE_OPENMPTARGET) target_link_libraries(kokkoscore PUBLIC atomic) -ENDIF() +endif() -IF (desul_FOUND) +if(desul_FOUND) target_link_libraries(kokkoscore PUBLIC desul_atomics) -ENDIF() +endif() -# FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency so we -# just append the flags in cmake/kokkos_tpls.cmake instead of linking with the -# OpenMP target. 
-IF(Kokkos_ENABLE_OPENMP AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_OPENMP) target_link_libraries(kokkoscore PUBLIC OpenMP::OpenMP_CXX) -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH) +kokkos_link_tpl(kokkoscore PUBLIC LIBQUADMATH) diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index 8bfaf8317b6594e0ce4e11f6183605714c1c93e9..07c35e6611f100254dc1c41a55bc03d79fa041e8 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -35,7 +35,6 @@ static_assert(false, #include <Cuda/Kokkos_Cuda_Error.hpp> // CUDA_SAFE_CALL #include <Kokkos_Parallel.hpp> -#include <Kokkos_TaskScheduler.hpp> #include <Kokkos_Layout.hpp> #include <Kokkos_ScratchSpace.hpp> #include <Kokkos_MemoryTraits.hpp> @@ -46,7 +45,6 @@ static_assert(false, namespace Kokkos { namespace Impl { -class CudaExec; class CudaInternal; } // namespace Impl } // namespace Kokkos @@ -129,33 +127,16 @@ class Cuda { /// \brief True if and only if this method is being called in a /// thread-parallel function. - KOKKOS_INLINE_FUNCTION static int in_parallel() { + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__CUDA_ARCH__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. - */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. 
- /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); +#endif /// \brief Wait until all dispatched functors complete. /// @@ -184,8 +165,17 @@ class Cuda { Cuda(); - Cuda(cudaStream_t stream, - Impl::ManageStream manage_stream = Impl::ManageStream::no); + explicit Cuda(cudaStream_t stream) : Cuda(stream, Impl::ManageStream::no) {} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template <typename T = void> + KOKKOS_DEPRECATED_WITH_COMMENT( + "Cuda execution space should be constructed explicitly.") + Cuda(cudaStream_t stream) + : Cuda(stream) {} +#endif + + Cuda(cudaStream_t stream, Impl::ManageStream manage_stream); KOKKOS_DEPRECATED Cuda(cudaStream_t stream, bool manage_stream); @@ -199,18 +189,37 @@ class Cuda { //! Initialize, telling the CUDA run-time library which device to use. static void impl_initialize(InitializationSettings const&); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Cuda device architecture of the selected device. /// /// This matches the __CUDA_ARCH__ specification. - static size_type device_arch(); + KOKKOS_DEPRECATED static size_type device_arch() { + const cudaDeviceProp cudaProp = Cuda().cuda_device_prop(); + return cudaProp.major * 100 + cudaProp.minor; + } //! Query device count. - static size_type detect_device_count(); + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; + } /** \brief Detect the available devices and their architecture * as defined by the __CUDA_ARCH__ specification. 
*/ - static std::vector<unsigned> detect_device_arch(); + KOKKOS_DEPRECATED static std::vector<unsigned> detect_device_arch() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + std::vector<unsigned> out; + for (int i = 0; i < count; ++i) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + out.push_back(prop.major * 100 + prop.minor); + } + return out; + } +#endif cudaStream_t cuda_stream() const; int cuda_device() const; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index c6512f44dadc974b381975ec67feda62789e4e4a..8bcd6525c96247576ea647fee37ea71aea8383c6 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -31,9 +31,7 @@ #include <algorithm> #include <atomic> -//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> #include <impl/Kokkos_Error.hpp> -#include <impl/Kokkos_MemorySpace.hpp> #include <impl/Kokkos_Tools.hpp> @@ -60,12 +58,6 @@ const std::unique_ptr<Kokkos::Cuda> &Kokkos::Impl::cuda_get_deep_copy_space( namespace Kokkos { namespace Impl { -namespace { - -static std::atomic<int> num_uvm_allocations(0); - -} // namespace - void DeepCopyCuda(void *dst, const void *src, size_t n) { KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( dst, src, n, cudaMemcpyDefault))); @@ -83,11 +75,11 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { KOKKOS_IMPL_CUDA_SAFE_CALL( (CudaInternal::singleton().cuda_memcpy_async_wrapper( dst, src, n, cudaMemcpyDefault, s))); - Impl::cuda_stream_synchronize( - s, + Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>( + "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync", Kokkos::Tools::Experimental::SpecialSynchronizationCases:: DeepCopyResourceSynchronization, - "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync"); + [&]() { 
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); }); } } // namespace Impl @@ -135,11 +127,23 @@ void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) { namespace Kokkos { -CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaSpace::CudaSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaSpace::CudaSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaUVMSpace::CudaUVMSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaUVMSpace::CudaUVMSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaHostPinnedSpace::CudaHostPinnedSpace() {} +CudaHostPinnedSpace::CudaHostPinnedSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaHostPinnedSpace::CudaHostPinnedSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} size_t memory_threshold_g = 40000; // 40 kB @@ -161,56 +165,73 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, } namespace { -void *impl_allocate_common(const Cuda &exec_space, const char *arg_label, - const size_t arg_alloc_size, +void *impl_allocate_common(const int device_id, + [[maybe_unused]] const cudaStream_t stream, + const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle, - bool exec_space_provided) { + [[maybe_unused]] bool stream_sync_only) { void *ptr = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + cudaError_t error_code = cudaSuccess; #ifndef CUDART_VERSION #error CUDART_VERSION undefined! 
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + // This is intended for Grace-Hopper (and future unified memory architectures) + // The idea is to use host allocator and then advise to keep it in HBM on the + // device, but that requires CUDA 12.2 + static_assert(CUDART_VERSION >= 12020, + "CUDA runtime version >=12.2 required when " + "Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set. " + "Please update your CUDA runtime version or " + "reconfigure with " + "-D Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF"); + if (arg_alloc_size) { // cudaMemAdvise_v2 does not work with nullptr + error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); + if (error_code == cudaSuccess) { + // One would think cudaMemLocation{device_id, + // cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of + // members doesn't seem to be defined. + cudaMemLocation loc; + loc.id = device_id; + loc.type = cudaMemLocationTypeDevice; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2( + ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc)); + } + } #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - cudaError_t error_code; - if (arg_alloc_size >= memory_threshold_g) { - if (exec_space_provided) { - error_code = - exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - exec_space.fence("Kokkos::Cuda: backend fence after async malloc"); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence after async malloc"); + // FIXME_KEPLER Everything after Kepler should support cudaMallocAsync + int device_supports_cuda_malloc_async; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceGetAttribute(&device_supports_cuda_malloc_async, + cudaDevAttrMemoryPoolsSupported, device_id)); + + if (arg_alloc_size >= memory_threshold_g && + device_supports_cuda_malloc_async == 1) { + error_code = 
cudaMallocAsync(&ptr, arg_alloc_size, stream); + + if (error_code == cudaSuccess) { + if (stream_sync_only) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + } else { + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async malloc"); + } } } else { - error_code = - (exec_space_provided - ? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size) - : Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size)); + error_code = cudaMalloc(&ptr, arg_alloc_size); } #else - cudaError_t error_code; - if (exec_space_provided) { - error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } + error_code = cudaMalloc(&ptr, arg_alloc_size); #endif + if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper(); - throw Experimental::CudaRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - CudaMalloc); + cudaGetLastError(); + Kokkos::Impl::throw_bad_alloc(arg_handle.name, arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { @@ -226,7 +247,7 @@ void *CudaSpace::impl_allocate( const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size, + return impl_allocate_common(m_device, m_stream, arg_label, arg_alloc_size, arg_logical_size, arg_handle, false); } @@ -234,8 +255,9 @@ void *CudaSpace::impl_allocate( const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, 
const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(exec_space, arg_label, arg_alloc_size, - arg_logical_size, arg_handle, true); + return impl_allocate_common( + exec_space.cuda_device(), exec_space.cuda_stream(), arg_label, + arg_alloc_size, arg_logical_size, arg_handle, true); } void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { @@ -254,30 +276,24 @@ void *CudaUVMSpace::impl_allocate( Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_allocate: Pre UVM Allocation"); if (arg_alloc_size > 0) { - Kokkos::Impl::num_uvm_allocations++; - - auto error_code = - Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper( - &ptr, arg_alloc_size, cudaMemAttachGlobal); - -#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST - if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_mem_advise_wrapper( - ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId))); -#endif + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); - throw Experimental::CudaRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - CudaMallocManaged); + cudaGetLastError(); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId)); +#endif } Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation"); @@ -302,17 +318,15 
@@ void *CudaHostPinnedSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; - auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper( - &ptr, arg_alloc_size, cudaHostAllocDefault); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); - throw Experimental::CudaRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - CudaHostAlloc); + cudaGetLastError(); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = @@ -343,28 +357,27 @@ void CudaSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - try { #ifndef CUDART_VERSION #error CUDART_VERSION undefined! 
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - if (arg_alloc_size >= memory_threshold_g) { - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence before async free"); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_async_wrapper( - arg_alloc_ptr))); - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence after async free"); - } else { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); - } + if (arg_alloc_size >= memory_threshold_g) { + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence before async free"); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, m_stream)); + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async free"); + } else { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); + } #else - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #endif - } catch (...) { - } } void CudaUVMSpace::deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const { @@ -390,13 +403,9 @@ void CudaUVMSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - try { - if (arg_alloc_ptr != nullptr) { - Kokkos::Impl::num_uvm_allocations--; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); - } - } catch (...) 
{ + if (arg_alloc_ptr != nullptr) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_deallocate: Post UVM Deallocation"); @@ -423,11 +432,8 @@ void CudaHostPinnedSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - try { - KOKKOS_IMPL_CUDA_SAFE_CALL(( - Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr))); - } catch (...) { - } + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } } // namespace Kokkos @@ -438,160 +444,6 @@ void CudaHostPinnedSpace::impl_deallocate( namespace Kokkos { namespace Impl { -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord<void, void> - SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record; - -SharedAllocationRecord<void, void> - SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record; - -SharedAllocationRecord<void, void> - SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::s_root_record; -#endif - -//============================================================================== -// <editor-fold desc="SharedAllocationRecord destructors"> {{{1 - -SharedAllocationRecord<Kokkos::CudaSpace, void>::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -void SharedAllocationRecord<Kokkos::CudaSpace, void>::deep_copy_header_no_exec( - void *ptr, const void *header) { - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(exec, ptr, header, - sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord<Kokkos::CudaSpace, " - "void>::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - 
-SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - SharedAllocationRecord<void, void>::m_alloc_size, - (SharedAllocationRecord<void, void>::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, - void>::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - SharedAllocationRecord<void, void>::m_alloc_size, - (SharedAllocationRecord<void, void>::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -// </editor-fold> end SharedAllocationRecord destructors }}}1 -//============================================================================== - -//============================================================================== -// <editor-fold desc="SharedAllocationRecord constructors"> {{{1 - -SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord<Kokkos::CudaSpace, " - "void>::SharedAllocationRecord(): fence after copying header from " 
- "HostSpace"); -} - -SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord( - const Kokkos::Cuda &arg_exec_space, const Kokkos::CudaSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>:: - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type 
arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, - void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -// </editor-fold> end SharedAllocationRecord constructors }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -620,19 +472,16 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #include <impl/Kokkos_SharedAlloc_timpl.hpp> -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class SharedAllocationRecordCommon<Kokkos::CudaSpace>; -template class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>; -template class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>; -template class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>; - -} // end namespace Impl -} // end namespace Kokkos +#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaSpace); +#else +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace); +#endif +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaHostPinnedSpace); // </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index b8fa335cd3b229a3a7c5883fe6e74f5c3d84d701..1ccf38a4a158042a5e999d3885e39383efbf8a54 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -68,9 +68,14 @@ class CudaSpace { /*--------------------------------*/ CudaSpace(); - CudaSpace(CudaSpace&& rhs) = default; - CudaSpace(const CudaSpace& rhs) = default; - CudaSpace& operator=(CudaSpace&& rhs) = default; + + private: + CudaSpace(int device_id, cudaStream_t stream); + + public: + CudaSpace(CudaSpace&& rhs) = default; + CudaSpace(const CudaSpace& rhs) = default; + CudaSpace& operator=(CudaSpace&& rhs) = default; CudaSpace& operator=(const CudaSpace& rhs) = default; ~CudaSpace() = default; @@ -83,15 +88,30 @@ class CudaSpace { void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; +#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + template <typename 
ExecutionSpace> + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } +#endif + /**\brief Deallocate untracked memory in the cuda space */ void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; void deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaSpace impl_create(int device_id, cudaStream_t stream) { + return CudaSpace(device_id, stream); + } + private: - template <class, class, class, class> - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const Cuda& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -112,10 +132,10 @@ class CudaSpace { static constexpr const char* name() { return m_name; } private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; static constexpr const char* m_name = "Cuda"; - friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; }; template <> @@ -149,13 +169,28 @@ class CudaUVMSpace { /*--------------------------------*/ CudaUVMSpace(); - CudaUVMSpace(CudaUVMSpace&& rhs) = default; - CudaUVMSpace(const CudaUVMSpace& rhs) = default; - CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; + + private: + CudaUVMSpace(int device_id, cudaStream_t stream); + + public: + CudaUVMSpace(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(const CudaUVMSpace& rhs) = default; + CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; CudaUVMSpace& operator=(const CudaUVMSpace& rhs) = default; ~CudaUVMSpace() = default; /**\brief Allocate untracked memory in the cuda space */ + template <typename ExecutionSpace> + 
void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -167,8 +202,6 @@ class CudaUVMSpace { const size_t arg_logical_size = 0) const; private: - template <class, class, class, class> - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -189,8 +222,13 @@ class CudaUVMSpace { #endif /*--------------------------------*/ + static CudaUVMSpace impl_create(int device_id, cudaStream_t stream) { + return CudaUVMSpace(device_id, stream); + } + private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST static bool kokkos_impl_cuda_pin_uvm_to_host_v; @@ -223,13 +261,28 @@ class CudaHostPinnedSpace { /*--------------------------------*/ CudaHostPinnedSpace(); - CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; - CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; - CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; + + private: + CudaHostPinnedSpace(int device_id, cudaStream_t stream); + + public: + CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; + CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace& operator=(const CudaHostPinnedSpace& rhs) = default; ~CudaHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template 
<typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -240,9 +293,11 @@ class CudaHostPinnedSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaHostPinnedSpace impl_create(int device_id, cudaStream_t stream) { + return CudaHostPinnedSpace(device_id, stream); + } + private: - template <class, class, class, class> - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -258,6 +313,9 @@ class CudaHostPinnedSpace { static constexpr const char* name() { return m_name; } private: + int m_device; + cudaStream_t m_stream; + static constexpr const char* m_name = "CudaHostPinned"; /*--------------------------------*/ @@ -280,22 +338,23 @@ const std::unique_ptr<Kokkos::Cuda>& cuda_get_deep_copy_space( bool initialize = true); static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, - Kokkos::CudaSpace>::assignable, - ""); -static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, - Kokkos::CudaUVMSpace>::assignable, - ""); + Kokkos::CudaSpace>::assignable); +static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, - Kokkos::CudaHostPinnedSpace>::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); //---------------------------------------- template <> 
struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> { enum : bool { assignable = false }; - enum : bool { accessible = false }; +#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + enum : bool{accessible = false}; +#else + enum : bool { accessible = true }; +#endif enum : bool { deepcopy = true }; }; @@ -516,179 +575,14 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord<Kokkos::CudaSpace, void> - : public HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace> { - private: - friend class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>; - friend class SharedAllocationRecordCommon<Kokkos::CudaSpace>; - friend class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>; - - using RecordBase = SharedAllocationRecord<void, void>; - using base_t = - HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::CudaSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
- template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record, +#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::CudaSpace); +#else +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace); #endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - // workaround for issue with NVCC and MSVC - // https://github.com/kokkos/kokkos/issues/5258 - deep_copy_header_no_exec(RecordBase::m_alloc_ptr, &header); - } - - SharedAllocationRecord( - const Kokkos::Cuda& exec_space, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::CudaSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - // helper function to work around MSVC+NVCC issue - // https://github.com/kokkos/kokkos/issues/5258 - static void deep_copy_header_no_exec(void*, const void*); -}; - -template <> -class SharedAllocationRecord<Kokkos::CudaUVMSpace, void> - : public SharedAllocationRecordCommon<Kokkos::CudaUVMSpace> { - private: - friend class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>; - - using base_t = SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>; - using RecordBase = 
SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaUVMSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void> - : public SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace> { - private: - friend class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>; - - using RecordBase = SharedAllocationRecord<void, void>; - using base_t = SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static 
RecordBase s_root_record; - - const Kokkos::CudaHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, - void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp index f68e05f7804028e4264224e543af87b4e3edb404..66656fefda5d82502b72adee7a9db8e8a69f47f7 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -22,15 +22,10 
@@ #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Profiling.hpp> -#include <iosfwd> namespace Kokkos { namespace Impl { -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string& name); void cuda_device_synchronize(const std::string& name); void cuda_stream_synchronize(const cudaStream_t stream, const std::string& name); @@ -73,52 +68,6 @@ inline void cuda_internal_safe_call(cudaError e, const char* name, Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__) } // namespace Impl - -namespace Experimental { - -class CudaRawMemoryAllocationFailure : public RawMemoryAllocationFailure { - private: - using base_t = RawMemoryAllocationFailure; - - cudaError_t m_error_code = cudaSuccess; - - static FailureMode get_failure_mode(cudaError_t error_code) { - switch (error_code) { - case cudaErrorMemoryAllocation: return FailureMode::OutOfMemoryError; - case cudaErrorInvalidValue: return FailureMode::InvalidAllocationSize; - // TODO handle cudaErrorNotSupported for cudaMallocManaged - default: return FailureMode::Unknown; - } - } - - public: - // using base_t::base_t; - // would trigger - // - // error: cannot determine the exception specification of the default - // constructor due to a circular dependency - // - // using NVCC 9.1 and gcc 7.4 - CudaRawMemoryAllocationFailure( - size_t arg_attempted_size, size_t arg_attempted_alignment, - FailureMode arg_failure_mode = FailureMode::OutOfMemoryError, - AllocationMechanism arg_mechanism = - AllocationMechanism::StdMalloc) noexcept - : base_t(arg_attempted_size, arg_attempted_alignment, arg_failure_mode, - arg_mechanism) {} - - CudaRawMemoryAllocationFailure(size_t arg_attempted_size, - cudaError_t arg_error_code, - AllocationMechanism arg_mechanism) noexcept - : base_t(arg_attempted_size, /* CudaSpace doesn't handle alignment? 
*/ 1, - get_failure_mode(arg_error_code), arg_mechanism), - m_error_code(arg_error_code) {} - - void append_additional_error_information(std::ostream& o) const override; -}; - -} // end namespace Experimental - } // namespace Kokkos #endif // KOKKOS_ENABLE_CUDA diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index a4d064e544a79bec682aff97305f7a0a1e640e73..058b1f538d56166ef65aaeecd3665ea9b87299ac 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -23,8 +23,7 @@ #include <Kokkos_Graph_fwd.hpp> -#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete -#include <impl/Kokkos_SharedAlloc.hpp> // SharedAllocationRecord +#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete #include <Kokkos_Parallel.hpp> #include <Kokkos_Parallel_Reduce.hpp> @@ -50,13 +49,10 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag, // covers and we're not modifying it Kokkos::ObservingRawPtr<const cudaGraph_t> m_graph_ptr = nullptr; Kokkos::ObservingRawPtr<cudaGraphNode_t> m_graph_node_ptr = nullptr; - // Note: owned pointer to CudaSpace memory (used for global memory launches), - // which we're responsible for deallocating, but not responsible for calling - // its destructor. - using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... - mutable Kokkos::OwningRawPtr<base_t> m_driver_storage = nullptr; + mutable std::shared_ptr<base_t> m_driver_storage = nullptr; + std::string label; public: using Policy = PolicyType; @@ -66,27 +62,20 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag, // attached to the policy? 
// TODO @graph kernel name info propagation template <class PolicyDeduced, class... ArgsDeduced> - GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, Cuda const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) // This is super ugly, but it works for now and is the most minimal change // to the codebase for now... - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} // FIXME @graph Forward through the instance once that works in the backends template <class PolicyDeduced> GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - // We should be the only owner, but this is still the easiest way to - // allocate and deallocate aligned memory for these sorts of things - Record::decrement(Record::get_record(m_driver_storage)); - } - } + : GraphNodeKernelImpl("[unlabeled]", ex, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -97,18 +86,21 @@ class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag, cudaGraphNode_t* get_cuda_graph_node_ptr() const { return m_graph_node_ptr; } cudaGraph_t const* get_cuda_graph_ptr() const { return m_graph_ptr; } - Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer( + const CudaSpace& mem) const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - - auto* record = Record::allocate( - Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = 
reinterpret_cast<base_t*>(record->data()); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr<base_t>( + static_cast<base_t*>(mem.allocate(alloc_label.c_str(), sizeof(base_t))), + [alloc_label, mem](base_t* ptr) { + mem.deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr) - return m_driver_storage; + return m_driver_storage.get(); } + + auto get_driver_storage() const { return m_driver_storage; } }; struct CudaGraphNodeAggregateKernel { @@ -140,7 +132,8 @@ struct get_graph_node_kernel_type<KernelType, Kokkos::ParallelReduceTag> // <editor-fold desc="get_cuda_graph_*() helper functions"> {{{1 template <class KernelType> -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const CudaSpace& mem, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type<KernelType>::type; auto const& kernel_as_graph_kernel = @@ -148,7 +141,7 @@ auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(mem); } template <class KernelType> diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index fcc3ff04ff58c24f3c6939c915b3ceeaeedf5776..8e800e756d2b1a6b0104c0701e3de369ac34d0b2 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -51,7 +51,14 @@ struct GraphImpl<Kokkos::Cuda> { using node_details_t = GraphNodeBackendSpecificDetails<Kokkos::Cuda>; - void _instantiate_graph() { + // Store drivers for the kernel nodes that 
launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. + std::vector<std::shared_ptr<void>> m_driver_storage; + + public: + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; @@ -60,10 +67,10 @@ struct GraphImpl<Kokkos::Cuda> { ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph, &error_node, error_log, error_log_size))); + KOKKOS_ENSURES(m_graph_exec); // TODO @graphs print out errors } - public: using root_node_impl_t = GraphNodeImpl<Kokkos::Cuda, Kokkos::Experimental::TypeErasedTag, Kokkos::Experimental::TypeErasedTag>; @@ -72,13 +79,13 @@ struct GraphImpl<Kokkos::Cuda> { GraphNodeImpl<Kokkos::Cuda, aggregate_kernel_impl_t, Kokkos::Experimental::TypeErasedTag>; - // Not moveable or copyable; it spends its whole life as a shared_ptr in the + // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl() { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to @@ -115,12 +122,9 @@ struct GraphImpl<Kokkos::Cuda> { template <class NodeImpl> // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl - // Also requires that the kernel has the graph node tag in it's policy + // Also requires that the kernel has the graph node tag in its policy void add_node(std::shared_ptr<NodeImpl> const& arg_node_ptr) { - static_assert( - NodeImpl::kernel_type::Policy::is_graph_kernel::value, - "Something has gone horribly wrong, but 
it's too complicated to " - "explain here. Buy Daisy a coffee and she'll explain it to you."); + static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(bool(arg_node_ptr)); // The Kernel launch from the execute() method has been shimmed to insert // the node into the graph @@ -132,6 +136,8 @@ struct GraphImpl<Kokkos::Cuda> { kernel.set_cuda_graph_node_ptr(&cuda_node); kernel.execute(); KOKKOS_ENSURES(bool(cuda_node)); + if (std::shared_ptr<void> tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } template <class NodeImplPtr, class PredecessorRef> @@ -161,13 +167,13 @@ struct GraphImpl<Kokkos::Cuda> { &cuda_node, 1))); } - void submit() { + void submit(const execution_space& exec) { if (!bool(m_graph_exec)) { - _instantiate_graph(); + instantiate(); } KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() - ->cuda_graph_launch_wrapper(m_graph_exec))); + (exec.impl_internal_space_instance()->cuda_graph_launch_wrapper( + m_graph_exec))); } execution_space const& get_execution_space() const noexcept { @@ -200,6 +206,9 @@ struct GraphImpl<Kokkos::Cuda> { m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_kernel_impl_t{}); } + + cudaGraph_t cuda_graph() { return m_graph; } + cudaGraphExec_t cuda_graph_exec() { return m_graph_exec; } }; } // end namespace Impl diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index d7f853d9910264126160790b3c443fe667841d68..ec5768a7f0f60151d2095eb12b99846d307b7801 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include <Kokkos_Core.hpp> -#include <Cuda/Kokkos_Cuda_Error.hpp> -#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> -#include <Cuda/Kokkos_Cuda_Instance.hpp> -#include <Cuda/Kokkos_Cuda_UniqueToken.hpp> +// #include <Cuda/Kokkos_Cuda_Error.hpp> +// #include 
<Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> +// #include <Cuda/Kokkos_Cuda_Instance.hpp> +// #include <Cuda/Kokkos_Cuda_UniqueToken.hpp> #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Tools.hpp> #include <impl/Kokkos_CheckedIntegerOps.hpp> @@ -97,21 +97,21 @@ __global__ void query_cuda_kernel_arch(int *d_arch) { } /** Query what compute capability is actually launched to the device: */ -int cuda_kernel_arch() { +int cuda_kernel_arch(int device_id) { int arch = 0; int *d_arch = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_malloc_wrapper( - reinterpret_cast<void **>(&d_arch), sizeof(int)))); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - d_arch, &arch, sizeof(int), cudaMemcpyDefault))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast<void **>(&d_arch), sizeof(int))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault)); query_cuda_kernel_arch<<<1, 1>>>(d_arch); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - &arch, d_arch, sizeof(int), cudaMemcpyDefault))); KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_free_wrapper(d_arch))); + cudaMemcpy(&arch, d_arch, sizeof(int), cudaMemcpyDefault)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(d_arch)); return arch; } @@ -135,7 +135,6 @@ Kokkos::View<uint32_t *, Kokkos::CudaSpace> cuda_global_unique_token_locks( return locks; } -// FIXME_CUDA_MULTIPLE_DEVICES void cuda_device_synchronize(const std::string &name) { Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>( name, @@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) { #if defined(KOKKOS_COMPILER_CLANG) // annotate with __host__ silence a clang warning about using // cudaDeviceSynchronize in device code - [] __host__() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + [] 
__host__() #else - []() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + []() #endif + { + for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } + }); } void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, @@ -168,25 +167,11 @@ void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, }); } -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string &name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>( - name, reason, [&]() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_synchronize_wrapper( - stream))); - }); -} - void cuda_internal_error_throw(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper<false>(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper<false>(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -196,10 +181,8 @@ void cuda_internal_error_throw(cudaError e, const char *name, const char *file, void cuda_internal_error_abort(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper<false>(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper<false>(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -208,96 +191,6 @@ void cuda_internal_error_abort(cudaError e, const char *name, const char *file, 
host_abort(out.str().c_str()); } -//---------------------------------------------------------------------------- -// Some significant cuda device properties: -// -// cudaDeviceProp::name : Text label for device -// cudaDeviceProp::major : Device major number -// cudaDeviceProp::minor : Device minor number -// cudaDeviceProp::warpSize : number of threads per warp -// cudaDeviceProp::multiProcessorCount : number of multiprocessors -// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block -// cudaDeviceProp::totalConstMem : capacity of constant memory -// cudaDeviceProp::totalGlobalMem : capacity of global memory -// cudaDeviceProp::maxGridSize[3] : maximum grid size - -// -// Section 4.4.2.4 of the CUDA Toolkit Reference Manual -// -// struct cudaDeviceProp { -// char name[256]; -// size_t totalGlobalMem; -// size_t sharedMemPerBlock; -// int regsPerBlock; -// int warpSize; -// size_t memPitch; -// int maxThreadsPerBlock; -// int maxThreadsDim[3]; -// int maxGridSize[3]; -// size_t totalConstMem; -// int major; -// int minor; -// int clockRate; -// size_t textureAlignment; -// int deviceOverlap; -// int multiProcessorCount; -// int kernelExecTimeoutEnabled; -// int integrated; -// int canMapHostMemory; -// int computeMode; -// int concurrentKernels; -// int ECCEnabled; -// int pciBusID; -// int pciDeviceID; -// int tccDriver; -// int asyncEngineCount; -// int unifiedAddressing; -// int memoryClockRate; -// int memoryBusWidth; -// int l2CacheSize; -// int maxThreadsPerMultiProcessor; -// }; - -namespace { - -class CudaInternalDevices { - public: - enum { MAXIMUM_DEVICE_COUNT = 64 }; - struct cudaDeviceProp m_cudaProp[MAXIMUM_DEVICE_COUNT]; - int m_cudaDevCount; - - CudaInternalDevices(); - - static const CudaInternalDevices &singleton(); -}; - -CudaInternalDevices::CudaInternalDevices() { - // See 'cudaSetDeviceFlags' for host-device thread interaction - // Section 4.4.2.6 of the CUDA Toolkit Reference Manual - - KOKKOS_IMPL_CUDA_SAFE_CALL( - 
(CudaInternal::singleton().cuda_get_device_count_wrapper<false>( - &m_cudaDevCount))); - - if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { - Kokkos::abort( - "Sorry, you have more GPUs per node than we thought anybody would ever " - "have. Please report this to github.com/kokkos/kokkos."); - } - for (int i = 0; i < m_cudaDevCount; ++i) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_properties_wrapper<false>( - m_cudaProp + i, i))); - } -} - -const CudaInternalDevices &CudaInternalDevices::singleton() { - static CudaInternalDevices self; - return self; -} - -} // namespace - //---------------------------------------------------------------------------- int Impl::CudaInternal::concurrency() { @@ -307,8 +200,6 @@ int Impl::CudaInternal::concurrency() { } void CudaInternal::print_configuration(std::ostream &s) const { - const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); - #if defined(KOKKOS_ENABLE_CUDA) s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif @@ -317,22 +208,23 @@ void CudaInternal::print_configuration(std::ostream &s) const { << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; #endif - for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { - s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name - << " capability " << dev_info.m_cudaProp[i].major << "." - << dev_info.m_cudaProp[i].minor << ", Total Global Memory: " - << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + for (int i : get_visible_devices()) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability " + << prop.major << "." 
<< prop.minor + << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) << ", Shared Memory per Block: " - << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + << human_memory_size(prop.sharedMemPerBlock); if (m_cudaDev == i) s << " : Selected"; - s << std::endl; + s << '\n'; } } //---------------------------------------------------------------------------- CudaInternal::~CudaInternal() { - if (m_stream || m_scratchSpace || m_scratchFlags || m_scratchUnified) { + if (m_scratchSpace || m_scratchFlags || m_scratchUnified) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; } @@ -370,45 +262,53 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { +void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; + // Check that the device associated with the stream matches cuda_device + CUcontext context; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuStreamGetCtx(stream, &context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPushCurrent(context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&m_cudaDev))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); + + m_stream = stream; + CudaInternal::cuda_devices.insert(m_cudaDev); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (!constantMemHostStagingPerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( + reinterpret_cast<void **>(&constantMemHostStagingPerDevice[m_cudaDev]), + CudaTraits::ConstantMemoryUsage))); + + if (!constantMemReusablePerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL( + 
(cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev]))); + //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { - const unsigned reduce_block_count = - m_maxWarpCount * Impl::CudaTraits::WarpSize; + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + auto const maxWarpCount = std::min<unsigned>( + m_deviceProp.maxThreadsPerBlock / CudaTraits::WarpSize, + CudaTraits::WarpSize); + unsigned const reduce_block_count = + maxWarpCount * Impl::CudaTraits::WarpSize; (void)scratch_unified(16 * sizeof(size_type)); (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - // Init the array for used for arbitrarily sized atomics - if (this == &singleton()) { - desul::Impl::init_lock_arrays(); // FIXME - } - - // Allocate a staging buffer for constant mem in pinned host memory - // and an event to avoid overwriting driver for previous kernel launches - if (this == &singleton()) { - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( - reinterpret_cast<void **>(&constantMemHostStaging), - CudaTraits::ConstantMemoryUsage))); - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_create_wrapper(&constantMemReusable))); - } - - m_stream = stream; - m_manage_stream = manage_stream; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -427,22 +327,23 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - 
Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast<size_type *>(r->data()); + m_scratchFlags = static_cast<size_type *>( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. KOKKOS_IMPL_CUDA_SAFE_CALL( (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size))); } @@ -453,21 +354,19 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast<size_type *>(r->data()); + m_scratchSpace = 
static_cast<size_type *>( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -476,23 +375,20 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { if (verify_is_initialized("scratch_unified") && m_scratchUnifiedCount < scratch_count(size)) { - m_scratchUnifiedCount = scratch_count(size); + auto mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>; + if (m_scratchUnified) { + mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + } - if (m_scratchUnified) - Record::decrement(Record::get_record(m_scratchUnified)); + m_scratchUnifiedCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain); - Record *const r = - Record::allocate(Kokkos::CudaHostPinnedSpace(), - "Kokkos::InternalScratchUnified", alloc_size); - - Record::increment(r); - - m_scratchUnified = reinterpret_cast<size_type *>(r->data()); + m_scratchUnified = static_cast<size_type *>( + mem_space.allocate("Kokkos::InternalScratchUnified", alloc_size)); } return m_scratchUnified; @@ -500,21 +396,16 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = - Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - if (m_scratchFunctor) - Record::decrement(Record::get_record(m_scratchFunctor)); + if (m_scratchFunctor) { + mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } - Record *const r = - Record::allocate(Kokkos::CudaSpace(), 
"Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - - Record::increment(r); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast<size_type *>(r->data()); + m_scratchFunctor = static_cast<size_type *>(mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); } return m_scratchFunctor; @@ -537,21 +428,21 @@ void *CudaInternal::resize_team_scratch_space(int scratch_pool_id, // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (m_team_scratch_current_size[scratch_pool_id] == 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc<Kokkos::CudaSpace>( - "Kokkos::CudaSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc<Kokkos::CudaSpace>( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", bytes); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -568,50 +459,33 @@ void CudaInternal::finalize() { was_finalized = true; - // Only finalize this if we're the singleton - if (this == &singleton()) { - (void)Impl::cuda_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_free_host_wrapper(constantMemHostStaging))); - 
KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_destroy_wrapper(constantMemReusable))); - auto &deep_copy_space = - Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); - if (deep_copy_space) - deep_copy_space->impl_internal_space_instance()->finalize(); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_stream_destroy_wrapper(cuda_get_deep_copy_stream()))); - } - + auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordCuda = Kokkos::Impl::SharedAllocationRecord<CudaSpace>; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>; - - RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags)); - RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace)); - RecordHost::decrement(RecordHost::get_record(m_scratchUnified)); - if (m_scratchFunctorSize > 0) - RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor)); + auto host_mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); + cuda_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + cuda_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + host_mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + if (m_scratchFunctorSize > 0) { + cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } } for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) - Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr[i]); + cuda_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); } - if (m_manage_stream && get_stream() != nullptr) - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_stream_destroy_wrapper(m_stream))); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchUnifiedCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; m_scratchUnified = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { 
m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -624,30 +498,6 @@ void CudaInternal::finalize() { //---------------------------------------------------------------------------- -Cuda::size_type cuda_internal_multiprocessor_count() { - return CudaInternal::singleton().m_multiProcCount; -} - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count() { -#if defined(KOKKOS_ARCH_KEPLER) - // Compute capability 3.0 through 3.7 - enum : int { max_resident_blocks_per_multiprocessor = 16 }; -#else - // Compute capability 5.0 through 6.2 - enum : int { max_resident_blocks_per_multiprocessor = 32 }; -#endif - return CudaInternal::singleton().m_multiProcCount * - max_resident_blocks_per_multiprocessor; -}; - -Cuda::size_type cuda_internal_maximum_warp_count() { - return CudaInternal::singleton().m_maxWarpCount; -} - -std::array<Cuda::size_type, 3> cuda_internal_maximum_grid_count() { - return CudaInternal::singleton().m_maxBlock; -} - Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance, const std::size_t size) { return instance.impl_internal_space_instance()->scratch_space(size); @@ -670,10 +520,6 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance, namespace Kokkos { -Cuda::size_type Cuda::detect_device_count() { - return Impl::CudaInternalDevices::singleton().m_cudaDevCount; -} - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int Cuda::concurrency() { #else @@ -687,25 +533,23 @@ int Cuda::impl_is_initialized() { } void Cuda::impl_initialize(InitializationSettings const &settings) { - const int cuda_device_id = Impl::get_gpu(settings); - const auto &dev_info = Impl::CudaInternalDevices::singleton(); + const std::vector<int> &visible_devices = Impl::get_visible_devices(); + const int cuda_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); - const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; - - Impl::CudaInternal::m_cudaDev = cuda_device_id; + cudaDeviceProp cudaProp; + 
KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGetDeviceProperties(&cudaProp, cuda_device_id)); Impl::CudaInternal::m_deviceProp = cudaProp; - - Kokkos::Impl::cuda_device_synchronize( - "Kokkos::CudaInternal::initialize: Fence on space initialization"); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // Query what compute capability architecture a kernel executes: - Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(); + Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); if (Impl::CudaInternal::m_cudaArch == 0) { - std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; - std::string msg = ss.str(); - Kokkos::abort(msg.c_str()); + Kokkos::abort( + "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"); } int compiled_major = Impl::CudaInternal::m_cudaArch / 100; @@ -762,76 +606,56 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default #endif //---------------------------------- - // number of multiprocessors - Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount; - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. 
- Impl::CudaInternal::m_maxWarpCount = - cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize; - - if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) { - Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize; +#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY + // Check if unified memory is available + int cuda_result; + cudaDeviceGetAttribute(&cuda_result, cudaDevAttrConcurrentManagedAccess, + cuda_device_id); + if (cuda_result == 0) { + Kokkos::abort( + "Kokkos::Cuda::initialize ERROR: Unified memory is not available on " + "this device\n" + "Please recompile Kokkos with " + "-DKokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF\n"); } - - //---------------------------------- - // Maximum number of blocks: - - Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0]; - Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1]; - Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2]; - - Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor; - Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock; - Impl::CudaInternal::m_maxBlocksPerSM = - Impl::CudaInternal::m_cudaArch < 500 - ? 16 - : (Impl::CudaInternal::m_cudaArch < 750 - ? 32 - : (Impl::CudaInternal::m_cudaArch == 750 ? 
16 : 32)); - Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor; - Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock; +#endif //---------------------------------- cudaStream_t singleton_stream; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_create_wrapper( - &singleton_stream))); - - auto &cuda_singleton = Impl::CudaInternal::singleton(); - cuda_singleton.initialize(singleton_stream, /*manage*/ true); -} - -std::vector<unsigned> Cuda::detect_device_arch() { - const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton(); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); - std::vector<unsigned> output(s.m_cudaDevCount); - - for (int i = 0; i < s.m_cudaDevCount; ++i) { - output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor; - } + // Init the array for used for arbitrarily sized atomics + desul::Impl::init_lock_arrays(); // FIXME - return output; + Impl::CudaInternal::singleton().initialize(singleton_stream); } -Cuda::size_type Cuda::device_arch() { - const int dev_id = Impl::CudaInternal::singleton().m_cudaDev; - - int dev_arch = 0; +void Cuda::impl_finalize() { + (void)Impl::cuda_global_unique_token_locks(true); + desul::Impl::finalize_lock_arrays(); // FIXME - if (0 <= dev_id) { - const struct cudaDeviceProp &cudaProp = - Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id]; - - dev_arch = cudaProp.major * 100 + cudaProp.minor; + for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice + [cuda_device])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy( + Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device])); } - return dev_arch; -} + auto &deep_copy_space = 
Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::cuda_get_deep_copy_stream())); -void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } + Impl::CudaInternal::singleton().finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::CudaInternal::singleton().m_stream)); +} Cuda::Cuda() : m_space_instance(&Impl::CudaInternal::singleton(), @@ -845,13 +669,17 @@ KOKKOS_DEPRECATED Cuda::Cuda(cudaStream_t stream, bool manage_stream) manage_stream ? Impl::ManageStream::yes : Impl::ManageStream::no) {} Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::CudaInternal, [manage_stream](Impl::CudaInternal *ptr) { + ptr->finalize(); + if (static_cast<bool>(manage_stream)) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); - m_space_instance->initialize(stream, static_cast<bool>(manage_stream)); + m_space_instance->initialize(stream); } void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { @@ -859,16 +687,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << " KOKKOS_ENABLE_CUDA: yes\n"; os << "Cuda Options:\n"; - os << " KOKKOS_ENABLE_CUDA_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA - os << "yes\n"; -#else - os << "no\n"; -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - os << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; - os << "yes\n"; -#endif os << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE os << "yes\n"; @@ -880,12 +698,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "yes\n"; #else 
os << "no\n"; -#endif - os << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - os << "yes\n"; -#else - os << "no\n"; #endif os << " KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC: "; #ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC @@ -893,6 +705,10 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { #else os << "no\n"; #endif +#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY + os << " KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY: "; + os << "yes\n"; +#endif os << "\nCuda Runtime Configuration:\n"; @@ -925,6 +741,14 @@ namespace Impl { int g_cuda_space_factory_initialized = initialize_space_factory<Cuda>("150_Cuda"); +int CudaInternal::m_cudaArch = -1; +cudaDeviceProp CudaInternal::m_deviceProp; +std::set<int> CudaInternal::cuda_devices = {}; +std::map<int, unsigned long *> CudaInternal::constantMemHostStagingPerDevice = + {}; +std::map<int, cudaEvent_t> CudaInternal::constantMemReusablePerDevice = {}; +std::map<int, std::mutex> CudaInternal::constantMemMutexPerDevice = {}; + } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index a324adecfeb03e4a0b56111c7978717b9ed77ea9..ffaa0f54749f94bfdd88c40cf7b26e325e7fd383 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -22,6 +22,10 @@ #include <atomic> #include <Cuda/Kokkos_Cuda_Error.hpp> #include <cuda_runtime_api.h> +#include "Kokkos_CudaSpace.hpp" + +#include <set> +#include <map> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -55,27 +59,10 @@ struct CudaTraits { unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( - 
CudaSpace::size_type i) { - return (i + WarpIndexMask) >> WarpIndexShift; - } - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align( - CudaSpace::size_type i) { - constexpr CudaSpace::size_type Mask = ~WarpIndexMask; - return (i + WarpIndexMask) & Mask; - } }; //---------------------------------------------------------------------------- -CudaSpace::size_type cuda_internal_multiprocessor_count(); -CudaSpace::size_type cuda_internal_maximum_warp_count(); -std::array<CudaSpace::size_type, 3> cuda_internal_maximum_grid_count(); - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count(); - CudaSpace::size_type* cuda_internal_scratch_flags(const Cuda&, const std::size_t size); CudaSpace::size_type* cuda_internal_scratch_space(const Cuda&, @@ -101,21 +88,13 @@ class CudaInternal { public: using size_type = Cuda::size_type; - inline static int m_cudaDev = -1; + int m_cudaDev = -1; // Device Properties - inline static int m_cudaArch = -1; - inline static unsigned m_multiProcCount = 0; - inline static unsigned m_maxWarpCount = 0; - inline static std::array<size_type, 3> m_maxBlock = {0, 0, 0}; - inline static int m_shmemPerSM = 0; - inline static int m_maxShmemPerBlock = 0; - inline static int m_maxBlocksPerSM = 0; - inline static int m_maxThreadsPerSM = 0; - inline static int m_maxThreadsPerBlock = 0; + static int m_cudaArch; static int concurrency(); - inline static cudaDeviceProp m_deviceProp; + static cudaDeviceProp m_deviceProp; // Scratch Spaces for Reductions mutable std::size_t m_scratchSpaceCount; @@ -129,7 +108,6 @@ class CudaInternal { mutable size_type* m_scratchFunctor; cudaStream_t m_stream; uint32_t m_instance_id; - bool m_manage_stream; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -142,11 +120,10 @@ class CudaInternal { bool was_initialized = false; bool was_finalized = false; - // FIXME_CUDA: these want to be per-device, not per-stream... 
use of 'static' - // here will break once there are multiple devices though - inline static unsigned long* constantMemHostStaging = nullptr; - inline static cudaEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + static std::set<int> cuda_devices; + static std::map<int, unsigned long*> constantMemHostStagingPerDevice; + static std::map<int, cudaEvent_t> constantMemReusablePerDevice; + static std::map<int, std::mutex> constantMemMutexPerDevice; static CudaInternal& singleton(); @@ -156,7 +133,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(cudaStream_t stream, bool manage_stream); + void initialize(cudaStream_t stream); void finalize(); void print_configuration(std::ostream&) const; @@ -247,12 +224,6 @@ class CudaInternal { return cudaDeviceSetLimit(limit, value); } - template <bool setCudaDevice = true> - cudaError_t cuda_device_synchronize_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaDeviceSynchronize(); - } - template <bool setCudaDevice = true> cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const { if constexpr (setCudaDevice) set_cuda_device(); @@ -290,37 +261,6 @@ class CudaInternal { return cudaFreeHost(ptr); } - template <bool setCudaDevice = true> - cudaError_t cuda_get_device_count_wrapper(int* count) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceCount(count); - } - - template <bool setCudaDevice = true> - cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop, - int device) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceProperties(prop, device); - } - - template <bool setCudaDevice = true> - const char* cuda_get_error_name_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorName(error); - } - - template <bool setCudaDevice = true> - const char* 
cuda_get_error_string_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorString(error); - } - - template <bool setCudaDevice = true> - cudaError_t cuda_get_last_error_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetLastError(); - } - template <bool setCudaDevice = true> cudaError_t cuda_graph_add_dependencies_wrapper( cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, @@ -480,23 +420,6 @@ class CudaInternal { return cudaStreamSynchronize(stream); } - // The following are only available for cuda 11.2 and greater -#if (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - template <bool setCudaDevice = true> - cudaError_t cuda_malloc_async_wrapper(void** devPtr, size_t size, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaMallocAsync(devPtr, size, get_input_stream(hStream)); - } - - template <bool setCudaDevice = true> - cudaError_t cuda_free_async_wrapper(void* devPtr, - cudaStream_t hStream = nullptr) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaFreeAsync(devPtr, get_input_stream(hStream)); - } -#endif - // C++ API routines template <typename T, bool setCudaDevice = true> cudaError_t cuda_func_get_attributes_wrapper(cudaFuncAttributes* attr, @@ -506,10 +429,10 @@ class CudaInternal { } template <typename T, bool setCudaDevice = true> - cudaError_t cuda_func_set_attributes_wrapper(T* entry, cudaFuncAttribute attr, - int value) const { + cudaError_t cuda_func_set_attribute_wrapper(T* entry, cudaFuncAttribute attr, + int value) const { if constexpr (setCudaDevice) set_cuda_device(); - return cudaFuncSetAttributes(entry, attr, value); + return cudaFuncSetAttribute(entry, attr, value); } template <bool setCudaDevice = true> diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp 
index 82a72b690218246f231ff86946effcad26ffc9ae..2d00e735cb9d82e44e024baf08ff543829b84c25 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -21,7 +21,6 @@ #ifdef KOKKOS_ENABLE_CUDA #include <mutex> -#include <string> #include <cstdint> #include <cmath> #include <Kokkos_Parallel.hpp> @@ -118,42 +117,43 @@ inline bool is_empty_launch(dim3 const& grid, dim3 const& block) { } inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { + int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock; + if (maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( - std::string("CudaParallelLaunch (or graph node creation) FAILED: shared" - " memory request is too large")); + "CudaParallelLaunch (or graph node creation) FAILED: shared memory " + "request is too large"); } } // These functions need to be templated on DriverType and LaunchBounds // so that the static bool is unique for each type combo // KernelFuncPtr does not necessarily contain that type information. 
-// FIXME_CUDA_MULTIPLE_DEVICES template <class DriverType, class LaunchBounds, class KernelFuncPtr> const cudaFuncAttributes& get_cuda_kernel_func_attributes( - const KernelFuncPtr& func) { + int cuda_device, const KernelFuncPtr& func) { // Only call cudaFuncGetAttributes once for each unique kernel // by leveraging static variable initialization rules - auto wrap_get_attributes = [&]() -> cudaFuncAttributes { + static std::map<int, cudaFuncAttributes> func_attr; + if (func_attr.find(cuda_device) == func_attr.end()) { cudaFuncAttributes attr; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr, - func))); - return attr; - }; - static cudaFuncAttributes func_attr = wrap_get_attributes(); - return func_attr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); + func_attr.emplace(cuda_device, attr); + } + return func_attr[cuda_device]; } template <class DriverType, class LaunchBounds, class KernelFuncPtr> -inline void configure_shmem_preference(const KernelFuncPtr& func, +inline void configure_shmem_preference(const int cuda_device, + const KernelFuncPtr& func, const cudaDeviceProp& device_props, const size_t block_size, int& shmem, const size_t occupancy) { #ifndef KOKKOS_ARCH_KEPLER const auto& func_attr = - get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func); + get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(cuda_device, + func); // Compute limits for number of blocks due to registers/SM const size_t regs_per_sm = device_props.regsPerMultiprocessor; @@ -209,8 +209,8 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, // Use multiples of 8kB const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; size_t carveout = shmem_per_block == 0 - ? 0 - : 100 * + ? 
0 + : 100 * (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) / min_shmem_size_per_sm) * @@ -222,7 +222,7 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, // FIXME_CUDA_MULTIPLE_DEVICES auto set_cache_config = [&] { KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_set_attributes_wrapper( + (CudaInternal::singleton().cuda_func_set_attribute_wrapper( func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout))); return carveout; }; @@ -387,8 +387,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference<DriverType, LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } void const* args[] = {&driver}; @@ -487,11 +487,14 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference<DriverType, LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel( + CudaSpace::impl_create(cuda_instance->m_cudaDev, + cuda_instance->m_stream), + driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl @@ -576,13 +579,16 @@ struct CudaParallelLaunchKernelInvoker< static void invoke_kernel(DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, CudaInternal const* 
cuda_instance) { + int cuda_device = cuda_instance->m_cudaDev; // Wait until the previous kernel that uses the constant buffer is done - std::lock_guard<std::mutex> lock(CudaInternal::constantMemMutex); + std::lock_guard<std::mutex> lock( + CudaInternal::constantMemMutexPerDevice[cuda_device]); KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; + unsigned long* staging = + cuda_instance->constantMemHostStagingPerDevice[cuda_device]; memcpy(staging, &driver, sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device @@ -597,7 +603,7 @@ struct CudaParallelLaunchKernelInvoker< // Record an event that says when the constant buffer can be reused KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); } inline static void create_parallel_launch_graph_node( @@ -665,8 +671,8 @@ struct CudaParallelLaunchImpl< Impl::configure_shmem_preference< DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } desul::ensure_cuda_lock_arrays_on_device(); @@ -675,18 +681,17 @@ struct CudaParallelLaunchImpl< base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_instance->cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); cuda_instance->fence( "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error"); #endif 
} } - static cudaFuncAttributes get_cuda_func_attributes() { + static cudaFuncAttributes get_cuda_func_attributes(int cuda_device) { return get_cuda_kernel_func_attributes< DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>( - base_t::get_kernel_func()); + cuda_device, base_t::get_kernel_func()); } }; @@ -712,7 +717,7 @@ struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism, CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>; template <class... Args> CudaParallelLaunch(Args&&... args) { - base_t::launch_kernel((Args &&) args...); + base_t::launch_kernel((Args&&)args...); } }; @@ -726,7 +731,7 @@ struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism, CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>; template <class... Args> CudaParallelLaunch(Args&&... args) { - base_t::create_parallel_launch_graph_node((Args &&) args...); + base_t::create_parallel_launch_graph_node((Args&&)args...); } }; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp index 7492ab49e56b903f5b326c8672c546a5877cdc85..2c7eba7a18ffce4847794f25d4fb3c2a0f2c5dbb 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -40,8 +40,8 @@ template <> inline TileSizeProperties get_tile_size_properties<Kokkos::Cuda>( const Kokkos::Cuda& space) { TileSizeProperties properties; - properties.max_threads = - space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.max_threads = space.impl_internal_space_instance() + ->m_deviceProp.maxThreadsPerMultiProcessor; properties.default_largest_tile_size = 16; properties.default_tile_size = 2; properties.max_total_tile_size = 512; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 
8aae27d091f268eb464eed1f9408624d6261cc7d..c50ff430345c3895b8358574ce6dd2b4371910ca 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -28,7 +28,6 @@ #include <Cuda/Kokkos_Cuda_KernelLaunch.hpp> #include <Cuda/Kokkos_Cuda_ReduceScan.hpp> #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> -#include <Kokkos_MinMaxClamp.hpp> #include <impl/Kokkos_Tools.hpp> #include <typeinfo> @@ -42,8 +41,8 @@ namespace Impl { template <typename ParallelType, typename Policy, typename LaunchBounds> int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { cudaFuncAttributes attr = - CudaParallelLaunch<ParallelType, - LaunchBounds>::get_cuda_func_attributes(); + CudaParallelLaunch<ParallelType, LaunchBounds>::get_cuda_func_attributes( + pol.space().cuda_device()); auto const& prop = pol.space().cuda_device_prop(); // Limits due to registers/SM, MDRange doesn't have @@ -96,11 +95,39 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = cuda_internal_maximum_grid_count(); + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxthreads = m_rp.space().cuda_device_prop().maxThreadsDim; + [[maybe_unused]] const auto maxThreadsPerBlock = + m_rp.space().cuda_device_prop().maxThreadsPerBlock; + // make sure the Z dimension (it is less than x,y limits) isn't exceeded + const auto clampZ = [&](const int input) { + return (input > maxthreads[2] ? 
maxthreads[2] : input); + }; + // make sure the block dimensions don't exceed the max number of threads + // allowed + const auto check_block_sizes = [&]([[maybe_unused]] const dim3& block) { + KOKKOS_ASSERT(block.x > 0 && + block.x <= static_cast<unsigned int>(maxthreads[0])); + KOKKOS_ASSERT(block.y > 0 && + block.y <= static_cast<unsigned int>(maxthreads[1])); + KOKKOS_ASSERT(block.z > 0 && + block.z <= static_cast<unsigned int>(maxthreads[2])); + KOKKOS_ASSERT(block.x * block.y * block.z <= + static_cast<unsigned int>(maxThreadsPerBlock)); + }; + // make sure the grid dimensions don't exceed the max number of blocks + // allowed + const auto check_grid_sizes = [&]([[maybe_unused]] const dim3& grid) { + KOKKOS_ASSERT(grid.x > 0 && + grid.x <= static_cast<unsigned int>(maxblocks[0])); + KOKKOS_ASSERT(grid.y > 0 && + grid.y <= static_cast<unsigned int>(maxblocks[1])); + KOKKOS_ASSERT(grid.z > 0 && + grid.z <= static_cast<unsigned int>(maxblocks[2])); + }; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); + check_block_sizes(block); const dim3 grid( std::min<array_index_type>( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -109,13 +136,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, maxblocks[1]), 1); + check_grid_sizes(grid); CudaParallelLaunch<ParallelFor, LaunchBounds>( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], clampZ(m_rp.m_tile[2])); + check_block_sizes(block); const dim3 grid( std::min<array_index_type>( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -126,15 +152,16 @@ class 
ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { std::min<array_index_type>( (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, maxblocks[2])); + // ensure we don't exceed the capability of the device + check_grid_sizes(grid); CudaParallelLaunch<ParallelFor, LaunchBounds>( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 4) { // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], - m_rp.m_tile[3]); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + clampZ(m_rp.m_tile[3])); + check_block_sizes(block); const dim3 grid( std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -144,14 +171,15 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { std::min<array_index_type>( (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch<ParallelFor, LaunchBounds>( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 5) { // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - KOKKOS_ASSERT(block.z > 0); + m_rp.m_tile[2] * m_rp.m_tile[3], clampZ(m_rp.m_tile[4])); + check_block_sizes(block); const dim3 grid( std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -160,6 +188,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { std::min<array_index_type>( (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch<ParallelFor, LaunchBounds>( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 6) { @@ -167,7 +196,8 @@ class ParallelFor<FunctorType, 
Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); + clampZ(m_rp.m_tile[4] * m_rp.m_tile[5])); + check_block_sizes(block); const dim3 grid( std::min<array_index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -175,6 +205,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { maxblocks[1]), std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch<ParallelFor, LaunchBounds>( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else { @@ -309,6 +340,11 @@ class ParallelReduce<CombinedFunctorReducerType, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); + } else { + // In the above call to final(), shared might have been updated by a + // single thread within a warp without synchronization. Synchronize + // threads within warp to avoid potential race condition. 
+ __syncwarp(0xffffffff); } for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { @@ -320,19 +356,18 @@ class ParallelReduce<CombinedFunctorReducerType, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { unsigned n = CudaTraits::WarpSize * 8; + int const maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; int shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>( f, n); using closure_type = Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch<closure_type, - LaunchBounds>::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch<closure_type, LaunchBounds>:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast<unsigned>( Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>( diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 5226c48bd9acc842e0095e9da334e230f1466fbd..8251fcb248d37dccda369b92b76de179649c147c 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -28,7 +28,6 @@ #include <Cuda/Kokkos_Cuda_KernelLaunch.hpp> #include <Cuda/Kokkos_Cuda_ReduceScan.hpp> #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> -#include <Kokkos_MinMaxClamp.hpp> #include <impl/Kokkos_Tools.hpp> #include <typeinfo> @@ -49,7 +48,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { const FunctorType m_functor; const Policy m_policy; - ParallelFor() = delete; + ParallelFor() = delete; ParallelFor& operator=(const ParallelFor&) = delete; template <class 
TagType> @@ -86,18 +85,18 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); cudaFuncAttributes attr = - CudaParallelLaunch<ParallelFor, - LaunchBounds>::get_cuda_func_attributes(); + CudaParallelLaunch<ParallelFor, LaunchBounds>::get_cuda_func_attributes( + m_policy.space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( m_policy.space().impl_internal_space_instance(), attr, m_functor, 1, 0, 0); KOKKOS_ASSERT(block_size > 0); dim3 block(1, block_size, 1); + const int maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0]; dim3 grid( - std::min( - typename Policy::index_type((nwork + block.y - 1) / block.y), - typename Policy::index_type(cuda_internal_maximum_grid_count()[0])), + std::min(typename Policy::index_type((nwork + block.y - 1) / block.y), + typename Policy::index_type(maxGridSizeX)), 1, 1); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) { @@ -243,6 +242,12 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); + } else if (word_count.value > 1) { + // Inside cuda_single_inter_block_reduce_scan() and final() above, + // shared[i] below might have been updated by a single thread within a + // warp without synchronization afterwards. Synchronize threads within + // warp to avoid potential race condition. 
+ __syncwarp(0xffffffff); } for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { @@ -254,19 +259,18 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { unsigned n = CudaTraits::WarpSize * 8; + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; int shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, value_type>( f, n); using closure_type = Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch<closure_type, - LaunchBounds>::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch<closure_type, LaunchBounds>:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast<unsigned>( Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>( @@ -308,8 +312,9 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, // REQUIRED ( 1 , N , 1 ) dim3 block(1, block_size, 1); // Required grid.x <= block.y - dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1, - 1); + dim3 grid(std::min(index_type(block.y), + index_type((nwork + block.y - 1) / block.y)), + 1, 1); // TODO @graph We need to effectively insert this in to the graph const int shmem = @@ -609,11 +614,11 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - 
->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem<true, WorkTag, value_type>(f, n)) { n >>= 1; @@ -933,11 +938,11 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem<true, WorkTag, value_type>(f, n)) { n >>= 1; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 498e57f94a7536863eaa73fb4bf89e43adeb40f2..a2955e3ab61d07325e57df6cd9e3be75ca250eff 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -32,7 +32,7 @@ #include <Cuda/Kokkos_Cuda_ReduceScan.hpp> #include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> #include <Cuda/Kokkos_Cuda_Team.hpp> -#include <Kokkos_MinMaxClamp.hpp> +#include <Kokkos_MinMax.hpp> #include <Kokkos_Vectorization.hpp> #include <impl/Kokkos_Tools.hpp> @@ -98,7 +98,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...> Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>; cudaFuncAttributes attr = CudaParallelLaunch<closure_type, typename traits::launch_bounds>:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); int block_size = Kokkos::Impl::cuda_get_max_block_size<FunctorType, typename traits::launch_bounds>( @@ -137,7 +137,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...> Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>; cudaFuncAttributes attr = CudaParallelLaunch<closure_type, typename traits::launch_bounds>:: - get_cuda_func_attributes(); + 
get_cuda_func_attributes(space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size<FunctorType, typename traits::launch_bounds>( @@ -262,7 +262,8 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...> m_tune_team(bool(team_size_request <= 0)), m_tune_vector(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0])) + const int maxGridSizeX = m_space.cuda_device_prop().maxGridSize[0]; + if (league_size_ >= maxGridSizeX) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution " "space."); @@ -369,7 +370,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...> cudaFuncAttributes attr = CudaParallelLaunch<closure_type, typename traits::launch_bounds>:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = std::forward<BlockSizeCallable>(block_size_callable)( space().impl_internal_space_instance(), attr, f, (size_t)impl_vector_length(), @@ -538,17 +539,14 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - cudaFuncAttributes attr = - CudaParallelLaunch<ParallelFor, - LaunchBounds>::get_cuda_func_attributes(); - m_team_size = - m_team_size >= 0 - ? 
m_team_size - : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( - internal_space_instance, attr, m_functor, m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor<Cuda, TeamPolicy> could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -575,21 +573,16 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, static_cast<std::int64_t>(m_league_size)))); } + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; const int shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - printf("%i %i\n", internal_space_instance->m_maxShmemPerBlock, - shmem_size_total); + if (maxShmemPerBlock < shmem_size_total) { + printf("%i %i\n", maxShmemPerBlock, shmem_size_total); Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } - if (int(m_team_size) > - int(Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>( - internal_space_instance, attr, arg_functor, - arg_policy.impl_vector_length(), - arg_policy.team_scratch_size(0), - arg_policy.thread_scratch_size(0)) / - arg_policy.impl_vector_length())) { + if (m_team_size > arg_policy.team_size_max(arg_functor, ParallelForTag())) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } @@ -623,11 +616,27 @@ class ParallelReduce<CombinedFunctorReducerType, public: using functor_type = FunctorType; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::Cuda::size_type) + // word_size_type is used 
to determine the word count, shared memory buffer + // size, and global memory buffer size before the reduction is performed. + // Within the reduction, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the reduction, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::Cuda::size_type), + std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, + Kokkos::Cuda::size_type>; using size_type = Cuda::size_type; using reducer_type = ReducerType; static constexpr bool UseShflReduction = - (true && (ReducerType::static_value_size() != 0)); + ReducerType::static_value_size() != 0; private: struct ShflReductionTag {}; @@ -646,9 +655,11 @@ class ParallelReduce<CombinedFunctorReducerType, const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; const bool m_result_ptr_host_accessible; - size_type* m_scratch_space; - size_type* m_scratch_flags; - size_type* m_unified_space; + word_size_type* m_scratch_space; + // m_scratch_flags must be of type Cuda::size_type due to use of atomics + // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp + Cuda::size_type* m_scratch_flags; + word_size_type* m_unified_space; size_type m_team_begin; size_type m_shmem_begin; size_type m_shmem_size; @@ -691,15 +702,17 @@ class ParallelReduce<CombinedFunctorReducerType, } } - __device__ inline void run(SHMEMReductionTag&, const int& threadid) const { - const integral_nonzero_constant< - size_type, ReducerType::static_value_size() / sizeof(size_type)> + __device__ inline void run(SHMEMReductionTag, const int& 
threadid) const { + const integral_nonzero_constant<word_size_type, + ReducerType::static_value_size() / + sizeof(word_size_type)> word_count(m_functor_reducer.get_reducer().value_size() / - sizeof(size_type)); + sizeof(word_size_type)); - reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory<size_type>() + - threadIdx.y * word_count.value); + reference_type value = + m_functor_reducer.get_reducer().init(reinterpret_cast<pointer_type>( + kokkos_impl_cuda_shared_memory<word_size_type>() + + threadIdx.y * word_count.value)); // Iterate this block through the league const int int_league_size = (int)m_league_size; @@ -721,18 +734,19 @@ class ParallelReduce<CombinedFunctorReducerType, if (!zero_length) do_final_reduction = cuda_single_inter_block_reduce_scan<false>( m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, + kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space, m_scratch_flags); if (do_final_reduction) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() + - (blockDim.y - 1) * word_count.value; + word_size_type* const shared = + kokkos_impl_cuda_shared_memory<word_size_type>() + + (blockDim.y - 1) * word_count.value; size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast<size_type*>(m_result_ptr) + ? reinterpret_cast<word_size_type*>(m_result_ptr) : (m_unified_space ? m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -742,6 +756,11 @@ class ParallelReduce<CombinedFunctorReducerType, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); + } else { + // In the above call to final(), shared might have been updated by a + // single thread within a warp without synchronization. Synchronize + // threads within warp to avoid potential race condition. 
+ __syncwarp(0xffffffff); } for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { @@ -782,7 +801,8 @@ class ParallelReduce<CombinedFunctorReducerType, *result = value; } else if (Impl::cuda_inter_block_reduction( value, init, m_functor_reducer.get_reducer(), - m_scratch_space, result, m_scratch_flags, blockDim.y)) { + reinterpret_cast<pointer_type>(m_scratch_space), result, + m_scratch_flags, blockDim.y)) { const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { m_functor_reducer.get_reducer().final(&value); @@ -803,13 +823,15 @@ class ParallelReduce<CombinedFunctorReducerType, 1u, UseShflReduction ? std::min(m_league_size, size_type(1024 * 32)) : std::min(int(m_league_size), m_team_size)); - m_scratch_space = cuda_internal_scratch_space( - m_policy.space(), - m_functor_reducer.get_reducer().value_size() * block_count); + m_scratch_space = + reinterpret_cast<word_size_type*>(cuda_internal_scratch_space( + m_policy.space(), + m_functor_reducer.get_reducer().value_size() * block_count)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), m_functor_reducer.get_reducer().value_size()); + m_unified_space = + reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified( + m_policy.space(), m_functor_reducer.get_reducer().value_size())); dim3 block(m_vector_size, m_team_size, 1); dim3 grid(block_count, 1, 1); @@ -842,7 +864,8 @@ class ParallelReduce<CombinedFunctorReducerType, } } else { const int size = m_functor_reducer.get_reducer().value_size(); - DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size); + DeepCopy<HostSpace, CudaSpace, Cuda>(m_policy.space(), m_result_ptr, + m_scratch_space, size); } } } @@ -878,18 +901,16 @@ class ParallelReduce<CombinedFunctorReducerType, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - 
cudaFuncAttributes attr = - CudaParallelLaunch<ParallelReduce, - LaunchBounds>::get_cuda_func_attributes(); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( - internal_space_instance, attr, - m_functor_reducer.get_functor(), m_vector_size, - m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce<Cuda, TeamPolicy> could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction @@ -935,6 +956,8 @@ class ParallelReduce<CombinedFunctorReducerType, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && @@ -943,7 +966,7 @@ class ParallelReduce<CombinedFunctorReducerType, std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (maxShmemPerBlock < shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much " "L0 scratch memory")); diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 7ccedbfe28daf16ecd985601b78c52df4bf83686..3037c4ab5414a0b326aad9c3199f2e4155f4d377 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -103,7 +103,7 @@ template <class FunctorType> __device__ bool cuda_inter_block_reduction( 
typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, const FunctorType& reducer, - Cuda::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, Cuda::size_type* const m_scratch_flags, const int max_active_thread = blockDim.y) { @@ -117,7 +117,7 @@ __device__ bool cuda_inter_block_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x; + pointer_type global = m_scratch_space + blockIdx.x; *global = value; } @@ -140,7 +140,7 @@ __device__ bool cuda_inter_block_reduction( last_block = true; value = neutral; - pointer_type const volatile global = (pointer_type)m_scratch_space; + pointer_type const volatile global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = @@ -702,8 +702,7 @@ inline void check_reduced_view_shmem_size(const Policy& policy, unsigned reqShmemSize = cuda_single_inter_block_reduce_scan_shmem<false, WorkTag, ValueType>( functor, minBlockSize); - size_t maxShmemPerBlock = - policy.space().impl_internal_space_instance()->m_maxShmemPerBlock; + size_t maxShmemPerBlock = policy.space().cuda_device_prop().sharedMemPerBlock; if (reqShmemSize > maxShmemPerBlock) { Kokkos::Impl::throw_runtime_exception( diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index baff7ef3f553ade083ce82721a5714f68964209d..5090e84c38cc635866020f14491567621eaaff5c 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -31,6 +31,9 @@ //---------------------------------------------------------------------------- +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #if defined(__CUDA_ARCH__) #define 
KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ { \ @@ -84,8 +87,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> { KOKKOS_INLINE_FUNCTION static void iff_single_thread_recursive_execute(scheduler_type const&) {} - static int get_max_team_count(execution_space const&) { - return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block; + static int get_max_team_count(execution_space const& space) { + return space.cuda_device_prop().multiProcessorCount * warps_per_block; } __device__ static void driver(scheduler_type scheduler, @@ -225,7 +228,11 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> { // FIXME_CUDA_MULTIPLE_DEVICES static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda& exec = scheduler.get_execution_space(); + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + exec.cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; const cudaStream_t stream = nullptr; @@ -245,34 +252,30 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> { // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 1 << 11; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, 
larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<<grid, block, shared_total, stream>>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::" "Cuda>::execute: Post Task Execution"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -300,8 +303,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> { set_cuda_task_base_apply_function_pointer<TaskType> <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::" "Cuda>::execute: Post Get Function Pointer for Tasks"); @@ -466,7 +469,13 @@ class TaskQueueSpecializationConstrained< static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda exec = Cuda(); // FIXME_CUDA_MULTIPLE_DEVICES + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + // FIXME not sure why this didn't work + // exec.cuda_device_prop().multiProcessorCount; + impl_instance->m_deviceProp.multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); // const dim3 grid( 1 , 1 , 1 ); const dim3 block(1, 
Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; @@ -482,34 +491,30 @@ class TaskQueueSpecializationConstrained< // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 2048; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<<grid, block, shared_total, stream>>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<" "Kokkos::Cuda>::execute: Post Execute Task"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -532,8 +537,7 @@ class TaskQueueSpecializationConstrained< set_cuda_task_base_apply_function_pointer<TaskType> <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<" 
"Kokkos::Cuda>::get_function_pointer: Post Get Function Pointer"); @@ -583,9 +587,9 @@ class TaskExec<Kokkos::Cuda, Scheduler> { private: enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize }; - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; + TaskExec(TaskExec&&) = delete; + TaskExec(TaskExec const&) = delete; + TaskExec& operator=(TaskExec&&) = delete; TaskExec& operator=(TaskExec const&) = delete; friend class Kokkos::Impl::TaskQueue< @@ -1223,5 +1227,7 @@ KOKKOS_INLINE_FUNCTION void single( #undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() + #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index c2b5f1fa78945beba56fcfdd78052a8b3e8de7f4..aec692c2c3669494b06fa08b18bb4a5764d1df55 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -184,24 +184,37 @@ class CudaTeamMember { * ( 1 == blockDim.z ) */ template <typename ReducerType> - KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<ReducerType>> team_reduce(ReducerType const& reducer) const noexcept { team_reduce(reducer, reducer.reference()); } template <typename ReducerType> - KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<ReducerType>> team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE(( + typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_team_reduce(wrapped_reducer, value); 
reducer.reference() = value;)) + } + + template <typename WrappedReducerType> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<WrappedReducerType>> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( - (typename Impl::FunctorAnalysis< - Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>, - ReducerType, typename ReducerType::value_type>::Reducer - wrapped_reducer(reducer); - cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y); - reducer.reference() = value;)) + (cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);)) } //-------------------------------------------------------------------------- @@ -260,23 +273,42 @@ class CudaTeamMember { //---------------------------------------- template <typename ReducerType> - KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer_v<ReducerType>> vector_reduce(ReducerType const& reducer) { vector_reduce(reducer, reducer.reference()); } template <typename ReducerType> - KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer_v<ReducerType>> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE( + (typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Cuda>, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_vector_reduce(wrapped_reducer, value); + reducer.reference() = value;)) + } + + template <typename WrappedReducerType> + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer_v<WrappedReducerType>> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { + 
(void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( (if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; unsigned mask = blockDim.x == 32 @@ -287,7 +319,7 @@ class CudaTeamMember { for (int i = blockDim.x; (i >>= 1);) { Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask); if ((int)threadIdx.x < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -297,7 +329,7 @@ class CudaTeamMember { // and thus different threads could have different results. Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask); - value = tmp2; reducer.reference() = tmp2;)) + value = tmp2;)) } //---------------------------------------- @@ -487,14 +519,21 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { KOKKOS_IF_ON_DEVICE( - (typename ReducerType::value_type value; + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::CudaTeamMember::execution_space>, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - reducer.init(value); + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value);)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); reducer.reference() = value;)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; 
(void)closure; @@ -518,16 +557,25 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; + KOKKOS_IF_ON_DEVICE( - (ValueType val; Kokkos::Sum<ValueType> reducer(val); + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::CudaTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value{}; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; - i < loop_boundaries.end; i += blockDim.y) { closure(i, val); } + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference();)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } template <typename iType, class Closure> @@ -548,16 +596,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value; - reducer.init(value); + KOKKOS_IF_ON_DEVICE( + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::CudaTeamMember::execution_space>, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, value); } + 
wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value;)) - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value);)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -573,18 +632,27 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE((ValueType val; Kokkos::Sum<ValueType> reducer(val); - reducer.init(reducer.reference()); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::CudaTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, val); } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference();)) + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + 
loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) } //---------------------------------------------------------------------------- @@ -632,13 +700,22 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< Closure const& closure, ReducerType const& reducer) { KOKKOS_IF_ON_DEVICE(( - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::CudaTeamMember::execution_space>, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.x) { closure(i, reducer.reference()); } + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } - Impl::CudaTeamMember::vector_reduce(reducer); + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value; )) // Avoid bogus warning about reducer value being uninitialized with combined @@ -667,15 +744,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE( - (result = ValueType(); - for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; i += blockDim.x) { closure(i, result); } + KOKKOS_IF_ON_DEVICE(( + + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::CudaTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - Impl::CudaTeamMember::vector_reduce(Kokkos::Sum<ValueType>(result)); 
+ wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - )) + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index abb747e39a1066d8f826e4ed51de1faaaaa6930b..94a428493f47d5f0516e5fbeed646e271ccc56b8 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -22,7 +22,6 @@ #include <Cuda/Kokkos_CudaSpace.hpp> #include <Kokkos_UniqueToken.hpp> -#include <impl/Kokkos_SharedAlloc.hpp> namespace Kokkos { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index a3f4f2f4cccf5dbdbb1abd951cd4f515394bc3dc..9e0c5819f712e3861490dc7006d9824f27ba03e1 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -125,8 +125,8 @@ struct in_place_shfl_op { struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> { template <class T> __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -136,28 +136,28 @@ struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> { }; template <class... Args> __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... 
args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> { template <class T> __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { return __shfl_up_sync(mask, val, lane, width); } }; template <class... Args> __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> { template <class T> __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -168,7 +168,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> { template <class... Args> __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... 
args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index a945a716bc336b3fe691a9db7909502491ff25ec..c7ea6988a5d06f13b35efeccb18feac535704bd9 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -77,7 +77,9 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, inline void execute() { const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const int multi_processor_count = + m_policy.space().cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared = 0; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index c7f0d12d914bf37f82fe30219cf0d8b9e31a28c2..0ac2d4052d2a432476e7d62f12e5ebe3699a4e41 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -23,24 +23,12 @@ namespace Kokkos { namespace Impl { -template <class T, class... 
P> -struct ZeroMemset<Kokkos::Cuda, View<T, P...>> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, const View<T, P...>& dst, - typename View<T, P...>::const_value_type&) { +template <> +struct ZeroMemset<Kokkos::Cuda> { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, void* dst, size_t cnt) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() - ->cuda_memset_async_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View<T, P...>::value_type)))); - } - - ZeroMemset(const View<T, P...>& dst, - typename View<T, P...>::const_value_type&) { - // FIXME_CUDA_MULTIPLE_DEVICES - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Kokkos::Impl::CudaInternal::singleton().cuda_memset_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View<T, P...>::value_type)))); + ->cuda_memset_async_wrapper(dst, 0, cnt))); } }; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp index c8d6641d1ee7ae66899d2a76caaad6d12cbaaeec..18aca15065ea1ab0041c6f4a11b2a6276085cdb7 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp @@ -28,35 +28,20 @@ extern "C" { /* Cuda runtime function, declared in <crt/device_runtime.h> * Requires capability 2.x or better. 
*/ -extern __device__ void __assertfail(const void *message, const void *file, - unsigned int line, const void *function, - size_t charsize); +[[noreturn]] __device__ void __assertfail(const void *message, const void *file, + unsigned int line, + const void *function, + size_t charsize); } namespace Kokkos { namespace Impl { -// required to workaround failures in random number generator unit tests with -// pre-volta architectures -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) -__device__ inline void cuda_abort(const char *const message) { -#else -[[noreturn]] __device__ inline void cuda_abort(const char *const message) { -#endif +[[noreturn]] __device__ static void cuda_abort(const char *const message) { const char empty[] = ""; __assertfail((const void *)message, (const void *)empty, (unsigned int)0, (const void *)empty, sizeof(char)); - - // This loop is never executed. It's intended to suppress warnings that the - // function returns, even though it does not. This is necessary because - // __assertfail is not marked as [[noreturn]], even though it does not return. 
- // Disable with KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK to workaround failures - // in random number generator unit tests with pre-volta architectures -#if !defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - while (true) - ; -#endif } } // namespace Impl diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp index f78bfd28b2f2d6a6cf61eedbd916bab386b6ffc4..8de3a8758fa11b3e8c913137edbf2c8994af0828 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -18,6 +18,7 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif +#include <Kokkos_Core.hpp> #include <HIP/Kokkos_HIP.hpp> #include <HIP/Kokkos_HIP_Instance.hpp> @@ -26,6 +27,8 @@ #include <hip/hip_runtime_api.h> +#include <iostream> + namespace Kokkos { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -41,39 +44,51 @@ int HIP::impl_is_initialized() { } void HIP::impl_initialize(InitializationSettings const& settings) { - const int hip_device_id = Impl::get_gpu(settings); + const std::vector<int>& visible_devices = Impl::get_visible_devices(); + const int hip_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( hipGetDeviceProperties(&Impl::HIPInternal::m_deviceProp, hip_device_id)); - const auto& hipProp = Impl::HIPInternal::m_deviceProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(hip_device_id)); - // number of multiprocessors - Impl::HIPInternal::m_multiProcCount = hipProp.multiProcessorCount; + // Check that we are running on the expected architecture. We print a warning + // instead of erroring out because AMD does not guarantee that gcnArchName + // will always contain the gfx flag. 
+ if (Kokkos::show_warnings()) { + if (std::string_view arch_name = + Impl::HIPInternal::m_deviceProp.gcnArchName; + arch_name.find(KOKKOS_ARCH_AMD_GPU) != 0) { + std::cerr + << "Kokkos::HIP::initialize WARNING: running kernels compiled for " + << KOKKOS_ARCH_AMD_GPU << " on " << arch_name << " device.\n"; + } + } - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. - Impl::HIPInternal::m_maxWarpCount = - hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; - if (Impl::HIPTraits::WarpSize < Impl::HIPInternal::m_maxWarpCount) { - Impl::HIPInternal::m_maxWarpCount = Impl::HIPTraits::WarpSize; + // Print a warning if the user did not select the right GFX942 architecture +#ifdef KOKKOS_ARCH_AMD_GFX942 + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 1)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300X " + "(discrete GPU) on a MI300A (APU).\n"; + } +#endif +#ifdef KOKKOS_ARCH_AMD_GFX942_APU + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 0)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300A " + "(APU) on a MI300X (discrete GPU).\n"; } +#endif - //---------------------------------- - // Maximum number of blocks - Impl::HIPInternal::m_maxBlock[0] = hipProp.maxGridSize[0]; - Impl::HIPInternal::m_maxBlock[1] = hipProp.maxGridSize[1]; - Impl::HIPInternal::m_maxBlock[2] = hipProp.maxGridSize[2]; - - // theoretically, we can get 40 WF's / CU, but only can sustain 32 see - // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 - Impl::HIPInternal::m_maxWavesPerCU = 32; - Impl::HIPInternal::m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; - Impl::HIPInternal::m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + // theoretically on GFX 9XX GPUs, we can get 40 WF's / CU, but only can + // sustain 32 see + // 
https://github.com/ROCm/clr/blob/4d0b815d06751735e6a50fa46e913fdf85f751f0/hipamd/src/hip_platform.cpp#L362-L366 + const int maxWavesPerCU = + Impl::HIPInternal::m_deviceProp.major <= 9 ? 32 : 64; Impl::HIPInternal::m_maxThreadsPerSM = - Impl::HIPInternal::m_maxWavesPerCU * Impl::HIPTraits::WarpSize; + maxWavesPerCU * Impl::HIPTraits::WarpSize; // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME @@ -89,10 +104,23 @@ void HIP::impl_initialize(InitializationSettings const& settings) { hipStream_t singleton_stream; KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&singleton_stream)); - Impl::HIPInternal::singleton().initialize(singleton_stream, /*manage*/ true); + Impl::HIPInternal::singleton().initialize(singleton_stream); } -void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } +void HIP::impl_finalize() { + (void)Impl::hip_global_unique_token_locks(true); + + desul::Impl::finalize_lock_arrays(); // FIXME + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipEventDestroy(Impl::HIPInternal::constantMemReusable)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipHostFree(Impl::HIPInternal::constantMemHostStaging)); + + Impl::HIPInternal::singleton().finalize(); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipStreamDestroy(Impl::HIPInternal::singleton().m_stream)); +} HIP::HIP() : m_space_instance(&Impl::HIPInternal::singleton(), @@ -102,13 +130,17 @@ HIP::HIP() } HIP::HIP(hipStream_t const stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::HIPInternal, [manage_stream](Impl::HIPInternal* ptr) { + ptr->finalize(); + if (static_cast<bool>(manage_stream)) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); - m_space_instance->initialize(stream, static_cast<bool>(manage_stream)); + 
m_space_instance->initialize(stream); } KOKKOS_DEPRECATED HIP::HIP(hipStream_t const stream, bool manage_stream) diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp index 61ed346b21825cd92d37b35c11b61a59a0c9fef7..439075fc6cc5fffc0fb191b893b7b3f4b0c7ce28 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp @@ -48,8 +48,19 @@ class HIP { using scratch_memory_space = ScratchMemorySpace<HIP>; HIP(); - HIP(hipStream_t stream, - Impl::ManageStream manage_stream = Impl::ManageStream::no); + + explicit HIP(hipStream_t stream) : HIP(stream, Impl::ManageStream::no) {} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template <typename T = void> + KOKKOS_DEPRECATED_WITH_COMMENT( + "HIP execution space should be constructed explicitly.") + HIP(hipStream_t stream) + : HIP(stream) {} +#endif + + HIP(hipStream_t stream, Impl::ManageStream manage_stream); + KOKKOS_DEPRECATED HIP(hipStream_t stream, bool manage_stream); //@} @@ -57,13 +68,15 @@ class HIP { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__HIP_DEVICE_COMPILE__) return true; #else return false; #endif } +#endif /** \brief Wait until all dispatched functors complete. 
* @@ -94,9 +107,13 @@ class HIP { static int impl_is_initialized(); - // static size_type device_arch(); - - static size_type detect_device_count(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 1f084c41e50e64a516e14c4f31b5f75856ca2bbe..90e5cf73559f1a9ee239556810050f3d96561362 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -113,8 +113,9 @@ unsigned hip_internal_get_block_size(const HIPInternal *hip_instance, const unsigned min_waves_per_eu = LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1; const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize; - const unsigned shmem_per_sm = hip_instance->m_shmemPerSM; - unsigned block_size = tperb_reg; + const unsigned shmem_per_sm = + hip_instance->m_deviceProp.maxSharedMemoryPerMultiProcessor; + unsigned block_size = tperb_reg; do { unsigned total_shmem = f(block_size); // find how many threads we can fit with this blocksize based on LDS usage diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp index 43d63c090b3712afc0908fb4da5f7ef8fac2ed31..fa45dcfec315c086f11010888fb7f04b433f201f 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp @@ -22,8 +22,6 @@ #include <hip/hip_runtime.h> -#include <ostream> - namespace Kokkos { namespace Impl { @@ -44,39 +42,4 @@ inline void hip_internal_safe_call(hipError_t e, const char* name, #define KOKKOS_IMPL_HIP_SAFE_CALL(call) \ Kokkos::Impl::hip_internal_safe_call(call, #call, 
__FILE__, __LINE__) -namespace Kokkos { -namespace Experimental { - -class HIPRawMemoryAllocationFailure : public RawMemoryAllocationFailure { - private: - hipError_t m_error_code = hipSuccess; - - static FailureMode get_failure_mode(hipError_t error_code) { - switch (error_code) { - case hipErrorMemoryAllocation: return FailureMode::OutOfMemoryError; - case hipErrorInvalidValue: return FailureMode::InvalidAllocationSize; - default: return FailureMode::Unknown; - } - } - - public: - HIPRawMemoryAllocationFailure(size_t arg_attempted_size, - hipError_t arg_error_code, - AllocationMechanism arg_mechanism) noexcept - : RawMemoryAllocationFailure( - arg_attempted_size, /* HIPSpace doesn't handle alignment? */ 1, - get_failure_mode(arg_error_code), arg_mechanism), - m_error_code(arg_error_code) {} - - void append_additional_error_information(std::ostream& o) const override { - if (m_error_code != hipSuccess) { - o << " The HIP allocation returned the error code \"" - << hipGetErrorName(m_error_code) << "\"."; - } - } -}; - -} // namespace Experimental -} // namespace Kokkos - #endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 576c53426bca1feb38af7afb7ffdae7103d9257c..584cc63d958c8c62cb70d5f9ea7939a119aa24cf 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -20,13 +20,11 @@ #include <Kokkos_Graph_fwd.hpp> #include <impl/Kokkos_GraphImpl.hpp> -#include <impl/Kokkos_SharedAlloc.hpp> #include <Kokkos_Parallel.hpp> #include <Kokkos_Parallel_Reduce.hpp> #include <Kokkos_PointerOwnership.hpp> -#include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp> #include <HIP/Kokkos_HIP_GraphNode_Impl.hpp> namespace Kokkos { @@ -43,26 +41,20 @@ class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...> using base_t = typename PatternImplSpecializationFromTag<PatternTag, Functor, Policy, 
Args..., Kokkos::HIP>::type; - using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>; // TODO use the name and executionspace template <typename PolicyDeduced, typename... ArgsDeduced> - GraphNodeKernelImpl(std::string, Kokkos::HIP const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, HIP const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} template <typename PolicyDeduced> GraphNodeKernelImpl(Kokkos::HIP const& exec_space, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Record::decrement(Record::get_record(m_driver_storage)); - } - } + : GraphNodeKernelImpl("[unlabeled]", exec_space, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_hip_graph_ptr(hipGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -76,24 +68,29 @@ class GraphNodeKernelImpl<Kokkos::HIP, PolicyType, Functor, PatternTag, Args...> hipGraph_t const* get_hip_graph_ptr() const { return m_graph_ptr; } - Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer( + const HIP& exec) const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - - auto* record = Record::allocate( - Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast<base_t*>(record->data()); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr<base_t>( + static_cast<base_t*>( + HIPSpace().allocate(exec, alloc_label.c_str(), 
sizeof(base_t))), + // FIXME_HIP Custom deletor should use same 'exec' as for allocation. + [alloc_label](base_t* ptr) { + HIPSpace().deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr); - - return m_driver_storage; + return m_driver_storage.get(); } + auto get_driver_storage() const { return m_driver_storage; } + private: Kokkos::ObservingRawPtr<const hipGraph_t> m_graph_ptr = nullptr; Kokkos::ObservingRawPtr<hipGraphNode_t> m_graph_node_ptr = nullptr; - Kokkos::OwningRawPtr<base_t> m_driver_storage = nullptr; + mutable std::shared_ptr<base_t> m_driver_storage = nullptr; + std::string label; }; struct HIPGraphNodeAggregateKernel { @@ -123,13 +120,14 @@ struct get_graph_node_kernel_type<KernelType, Kokkos::ParallelReduceTag> Kokkos::ParallelReduceTag>> {}; template <typename KernelType> -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const HIP& exec, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type<KernelType>::type; auto const& kernel_as_graph_kernel = static_cast<graph_node_kernel_t const&>(kernel); - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(exec); } template <typename KernelType> diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index 3bde15444c7a847df6a92396bd379a30e08cf724..4f97214ca683f15f73a60ec83ed24d723ccde5ad 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -40,13 +40,13 @@ class GraphImpl<Kokkos::HIP> { GraphNodeImpl<Kokkos::HIP, aggregate_kernel_impl_t, Kokkos::Experimental::TypeErasedTag>; - // Not moveable or copyable; it spends its whole life as a shared_ptr in the + // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph 
object. - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); @@ -60,7 +60,7 @@ class GraphImpl<Kokkos::HIP> { template <class NodeImplPtr, class PredecessorRef> void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::HIP& exec); Kokkos::HIP const& get_execution_space() const noexcept; @@ -69,21 +69,31 @@ class GraphImpl<Kokkos::HIP> { template <class... PredecessorRefs> auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; hipGraphNode_t error_node = nullptr; char error_log[error_log_size]; KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphInstantiate( &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); + KOKKOS_ENSURES(m_graph_exec); } + hipGraph_t hip_graph() { return m_graph; } + hipGraphExec_t hip_graph_exec() { return m_graph_exec; } + + private: Kokkos::HIP m_execution_space; hipGraph_t m_graph = nullptr; hipGraphExec_t m_graph_exec = nullptr; + + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. 
+ std::vector<std::shared_ptr<void>> m_driver_storage; }; -GraphImpl<Kokkos::HIP>::~GraphImpl() { +inline GraphImpl<Kokkos::HIP>::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(m_graph); if (m_graph_exec) { @@ -92,12 +102,12 @@ GraphImpl<Kokkos::HIP>::~GraphImpl() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph)); } -GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance) +inline GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance) : m_execution_space(std::move(instance)) { KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0)); } -void GraphImpl<Kokkos::HIP>::add_node( +inline void GraphImpl<Kokkos::HIP>::add_node( std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node @@ -108,9 +118,9 @@ void GraphImpl<Kokkos::HIP>::add_node( } // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl -// Also requires that the kernel has the graph node tag in it's policy +// Also requires that the kernel has the graph node tag in its policy template <class NodeImpl> -void GraphImpl<Kokkos::HIP>::add_node( +inline void GraphImpl<Kokkos::HIP>::add_node( std::shared_ptr<NodeImpl> const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -123,14 +133,16 @@ void GraphImpl<Kokkos::HIP>::add_node( kernel.set_hip_graph_node_ptr(&node); kernel.execute(); KOKKOS_ENSURES(node); + if (std::shared_ptr<void> tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } // Requires PredecessorRef is a specialization of GraphNodeRef that has // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. 
template <class NodeImplPtr, class PredecessorRef> -void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr, - PredecessorRef arg_pred_ref) { +inline void GraphImpl<Kokkos::HIP>::add_predecessor( + NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); KOKKOS_EXPECTS(pred_ptr); @@ -145,20 +157,19 @@ void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr, hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -void GraphImpl<Kokkos::HIP>::submit() { +inline void GraphImpl<Kokkos::HIP>::submit(const Kokkos::HIP& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - KOKKOS_IMPL_HIP_SAFE_CALL( - hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphLaunch(m_graph_exec, exec.hip_stream())); } -Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() const - noexcept { +inline Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() + const noexcept { return m_execution_space; } -auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() { +inline auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() { KOKKOS_EXPECTS(m_graph); KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared<root_node_impl_t>(get_execution_space(), @@ -172,7 +183,7 @@ auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() { } template <class... PredecessorRefs> -auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) { +inline auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) 
{ // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for // each predecessor ref, so all we need to do here is create the (trivial) diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 7f04eb721cb4e707fea6d5481935ff39124266b4..54e8c315e3f61522658ef42953cad6569f64cb03 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -27,6 +27,7 @@ #include <HIP/Kokkos_HIP.hpp> #include <HIP/Kokkos_HIP_Space.hpp> #include <impl/Kokkos_CheckedIntegerOps.hpp> +#include <impl/Kokkos_DeviceManagement.hpp> #include <impl/Kokkos_Error.hpp> /*--------------------------------------------------------------------------*/ @@ -76,7 +77,8 @@ std::size_t scratch_count(const std::size_t size) { //---------------------------------------------------------------------------- int HIPInternal::concurrency() { - static int const concurrency = m_maxThreadsPerSM * m_multiProcCount; + static int const concurrency = + m_maxThreadsPerSM * m_deviceProp.multiProcessorCount; return concurrency; } @@ -89,10 +91,21 @@ void HIPInternal::print_configuration(std::ostream &s) const { << '\n'; #endif - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); + s << "macro KOKKOS_ENABLE_ROCTHRUST : " +#if defined(KOKKOS_ENABLE_ROCTHRUST) + << "defined\n"; +#else + << "undefined\n"; +#endif + + s << "macro KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC: "; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + s << "yes\n"; +#else + s << "no\n"; +#endif - for (int i = 0; i < hipDevCount; ++i) { + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); std::string gpu_type = hipProp.integrated == 1 ? 
"APU" : "dGPU"; @@ -159,22 +172,29 @@ void HIPInternal::fence(const std::string &name) const { [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); }); } -void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { +void HIPInternal::initialize(hipStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); - m_stream = stream; - m_manage_stream = manage_stream; + m_stream = stream; //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + unsigned int maxWarpCount = + m_deviceProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; + if (Impl::HIPTraits::WarpSize < maxWarpCount) { + maxWarpCount = Impl::HIPTraits::WarpSize; + } + const unsigned reduce_block_count = - m_maxWarpCount * Impl::HIPTraits::WarpSize; + maxWarpCount * Impl::HIPTraits::WarpSize; (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); @@ -192,20 +212,19 @@ void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = 
Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast<size_type *>(r->data()); + m_scratchSpace = static_cast<size_type *>( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -214,21 +233,23 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast<size_type *>(r->data()); + m_scratchFlags = static_cast<size_type *>( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. 
KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size)); } @@ -238,29 +259,20 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution( void const *driver, std::size_t const size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPHostPinnedSpace, void>; + Kokkos::HIPSpace device_mem_space; + Kokkos::HIPHostPinnedSpace host_mem_space; if (m_scratchFunctor) { - Record::decrement(Record::get_record(m_scratchFunctor)); - RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } - Record *const r = - Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - RecordHost *const r_host = RecordHost::allocate( - Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); - - Record::increment(r); - RecordHost::increment(r_host); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast<size_type *>(r->data()); - m_scratchFunctorHost = reinterpret_cast<size_type *>(r_host->data()); + m_scratchFunctor = static_cast<size_type *>(device_mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); + m_scratchFunctorHost = static_cast<size_type *>(host_mem_space.allocate( + "Kokkos::InternalScratchFunctorHost", m_scratchFunctorSize)); } // When using HSA_XNACK=1, it is necessary to copy the driver to the host to @@ -323,23 +335,18 @@ void HIPInternal::finalize() { this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); was_finalized = true; - if (this == &singleton()) { - 
(void)Kokkos::Impl::hip_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); - } - if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordHIP = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace>; + Kokkos::HIPSpace device_mem_space; - RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + device_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); if (m_scratchFunctorSize > 0) { - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + Kokkos::HIPHostPinnedSpace host_mem_space; + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } } @@ -348,14 +355,10 @@ void HIPInternal::finalize() { Kokkos::kokkos_free<Kokkos::HIPSpace>(m_team_scratch_ptr[i]); } - if (m_manage_stream && m_stream != nullptr) - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream)); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -366,18 +369,20 @@ void HIPInternal::finalize() { m_num_scratch_locks = 0; } -//---------------------------------------------------------------------------- +int HIPInternal::m_hipDev = -1; +int HIPInternal::m_maxThreadsPerSM = 0; -Kokkos::HIP::size_type hip_internal_multiprocessor_count() { - return HIPInternal::singleton().m_multiProcCount; -} +hipDeviceProp_t HIPInternal::m_deviceProp; -Kokkos::HIP::size_type hip_internal_maximum_warp_count() {
- return HIPInternal::singleton().m_maxWarpCount; -} +std::mutex HIPInternal::scratchFunctorMutex; +unsigned long *HIPInternal::constantMemHostStaging = nullptr; +hipEvent_t HIPInternal::constantMemReusable = nullptr; +std::mutex HIPInternal::constantMemMutex; -std::array<Kokkos::HIP::size_type, 3> hip_internal_maximum_grid_count() { - return HIPInternal::singleton().m_maxBlock; +//---------------------------------------------------------------------------- + +Kokkos::HIP::size_type hip_internal_multiprocessor_count() { + return HIPInternal::singleton().m_deviceProp.multiProcessorCount; } Kokkos::HIP::size_type *hip_internal_scratch_space(const HIP &instance, @@ -419,13 +424,3 @@ void Kokkos::Impl::create_HIP_instances(std::vector<HIP> &instances) { instances[s] = HIP(stream, ManageStream::yes); } } - -//---------------------------------------------------------------------------- - -namespace Kokkos { -HIP::size_type HIP::detect_device_count() { - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); - return hipDevCount; -} -} // namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index ef140ec46c061d6d90afb9bf9afee312b9d0527d..d8043dc23d7a25c290b423d68284fc167b808433 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -30,11 +30,13 @@ namespace Impl { struct HIPTraits { #if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ - defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942) + defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ + defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ -#elif defined(KOKKOS_ARCH_AMD_GFX1030) || 
defined(KOKKOS_ARCH_AMD_GFX1100) +#elif defined(KOKKOS_ARCH_AMD_GFX1030) || defined(KOKKOS_ARCH_AMD_GFX1100) || \ + defined(KOKKOS_ARCH_AMD_GFX1103) static constexpr int WarpSize = 32; static constexpr int WarpIndexMask = 0x001f; /* hexadecimal for 31 */ static constexpr int WarpIndexShift = 5; /* WarpSize == 1 << WarpShift*/ @@ -50,8 +52,6 @@ struct HIPTraits { //---------------------------------------------------------------------------- -HIP::size_type hip_internal_maximum_warp_count(); -std::array<HIP::size_type, 3> hip_internal_maximum_grid_count(); HIP::size_type hip_internal_multiprocessor_count(); HIP::size_type *hip_internal_scratch_space(const HIP &instance, @@ -69,16 +69,10 @@ class HIPInternal { public: using size_type = ::Kokkos::HIP::size_type; - inline static int m_hipDev = -1; - inline static unsigned m_multiProcCount = 0; - inline static unsigned m_maxWarpCount = 0; - inline static std::array<size_type, 3> m_maxBlock = {0, 0, 0}; - inline static unsigned m_maxWavesPerCU = 0; - inline static int m_shmemPerSM = 0; - inline static int m_maxShmemPerBlock = 0; - inline static int m_maxThreadsPerSM = 0; + static int m_hipDev; + static int m_maxThreadsPerSM; - inline static hipDeviceProp_t m_deviceProp; + static hipDeviceProp_t m_deviceProp; static int concurrency(); @@ -91,13 +85,12 @@ class HIPInternal { size_type *m_scratchFlags = nullptr; mutable size_type *m_scratchFunctor = nullptr; mutable size_type *m_scratchFunctorHost = nullptr; - inline static std::mutex scratchFunctorMutex; + static std::mutex scratchFunctorMutex; hipStream_t m_stream = nullptr; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<HIP>( reinterpret_cast<uintptr_t>(this)); - bool m_manage_stream = false; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -111,9 +104,9 @@ class HIPInternal { // FIXME_HIP: these want to be per-device, not per-stream... 
use of 'static' // here will break once there are multiple devices though - inline static unsigned long *constantMemHostStaging = nullptr; - inline static hipEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + static unsigned long *constantMemHostStaging; + static hipEvent_t constantMemReusable; + static std::mutex constantMemMutex; static HIPInternal &singleton(); @@ -123,7 +116,7 @@ class HIPInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(hipStream_t stream, bool manage_stream); + void initialize(hipStream_t stream); void finalize(); void print_configuration(std::ostream &) const; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 7cd0afcf47fc004776554f44e64b213b7c1a12e2..e243eb07e7844a441a0f3a6b54e5d29358b43361 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -25,11 +25,7 @@ #include <HIP/Kokkos_HIP_Instance.hpp> #include <HIP/Kokkos_HIP_Space.hpp> -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) -#define KOKKOS_IMPL_HIP_GRAPH_ENABLED -#endif - -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH #include <HIP/Kokkos_HIP_GraphNodeKernel.hpp> #include <impl/Kokkos_GraphImpl_fwd.hpp> #endif @@ -173,15 +169,15 @@ struct DeduceHIPLaunchMechanism { static constexpr HIPLaunchMechanism launch_mechanism = ((property & force_global_launch) == force_global_launch) ? HIPLaunchMechanism::GlobalMemory - : ((property & light_weight) == light_weight) - ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit - ? HIPLaunchMechanism::LocalMemory - : HIPLaunchMechanism::GlobalMemory) - : (((property & heavy_weight) == heavy_weight) - ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage - ? 
HIPLaunchMechanism::ConstantMemory - : HIPLaunchMechanism::GlobalMemory) - : (default_launch_mechanism)); + : ((property & light_weight) == light_weight) + ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::GlobalMemory) + : (((property & heavy_weight) == heavy_weight) + ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::GlobalMemory) + : (default_launch_mechanism)); }; template <typename DriverType, typename LaunchBounds, @@ -384,7 +380,7 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds, driver); } -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH static void create_parallel_launch_graph_node( DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const * /*hip_instance*/) { @@ -442,7 +438,7 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds, driver_ptr); } -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH static void create_parallel_launch_graph_node( DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const *hip_instance) { @@ -453,15 +449,17 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds, KOKKOS_EXPECTS(!graph_node); if (!Impl::is_empty_launch(grid, block)) { - auto *driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + auto *driver_ptr = Impl::allocate_driver_storage_for_kernel( + HIP(hip_instance->m_stream, ManageStream::no), driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl // which is guaranteed to be alive until the graph instance itself is // destroyed, where there should be a fence ensuring that the allocation // associated with this kernel on the device side isn't deleted. 
- hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, - hip_instance->m_stream); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), + hipMemcpyDefault, hip_instance->m_stream)); void const *args[] = {&driver_ptr}; @@ -551,11 +549,11 @@ struct HIPParallelLaunch< LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, + const dim3 &block, const unsigned int shmem, const HIPInternal *hip_instance, const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { + if (hip_instance->m_deviceProp.sharedMemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( "HIPParallelLaunch FAILED: shared memory request is too large"); } @@ -585,7 +583,7 @@ void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds, @@ -628,8 +626,6 @@ void hip_parallel_launch(const DriverType &driver, const dim3 &grid, } // namespace Impl } // namespace Kokkos -#undef KOKKOS_IMPL_HIP_GRAPH_ENABLED - #endif #endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4c5afec398f93364e6be268daca73759a9459aa2 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp @@ -0,0 +1,173 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP + +#include <Kokkos_Parallel.hpp> + +#include <HIP/Kokkos_HIP_BlockSize_Deduction.hpp> +#include <HIP/Kokkos_HIP_KernelLaunch.hpp> +#include <KokkosExp_MDRangePolicy.hpp> +#include <impl/KokkosExp_IterateTileGPU.hpp> + +namespace Kokkos { +namespace Impl { + +// ParallelFor +template <class FunctorType, class... Traits> +class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, HIP> { + public: + using Policy = Kokkos::MDRangePolicy<Traits...>; + using functor_type = FunctorType; + + private: + using array_index_type = typename Policy::array_index_type; + using index_type = typename Policy::index_type; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType, + typename Policy::work_tag>(m_policy, + m_functor) + .exec_range(); + } + + inline void execute() const { + using ClosureType = ParallelFor<FunctorType, Policy, HIP>; + if (m_policy.m_num_tiles == 0) return; + auto const maxblocks = m_policy.space().hip_device_prop().maxGridSize; + if (Policy::rank == 2) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); + dim3 const grid( + std::min<array_index_type>( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min<array_index_type>( + (m_policy.m_upper[1] - m_policy.m_lower[1] 
+ block.y - 1) / + block.y, + maxblocks[1]), + 1); + hip_parallel_launch<ClosureType, LaunchBounds>( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 3) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], + m_policy.m_tile[2]); + dim3 const grid( + std::min<array_index_type>( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min<array_index_type>( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + std::min<array_index_type>( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch<ClosureType, LaunchBounds>( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 4) { + // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to + // threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2], m_policy.m_tile[3]); + dim3 const grid( + std::min<array_index_type>( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min<array_index_type>( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / + block.y, + maxblocks[1]), + std::min<array_index_type>( + (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch<ClosureType, LaunchBounds>( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 5) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 + // to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4]); + dim3 const grid( + std::min<array_index_type>( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min<array_index_type>( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + 
std::min<array_index_type>( + (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch<ClosureType, LaunchBounds>( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 6) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; + // id4,id5 to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4] * m_policy.m_tile[5]); + dim3 const grid( + std::min<array_index_type>( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min<array_index_type>( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min<array_index_type>( + m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); + hip_parallel_launch<ClosureType, LaunchBounds>( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else { + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); + } + + } // end execute + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + + template <typename Policy, typename Functor> + static int max_tile_size_product(const Policy&, const Functor&) { + using closure_type = + ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, HIP>; + unsigned block_size = hip_get_max_blocksize<closure_type, LaunchBounds>(); + if (block_size == 0) + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "tile size.")); + return block_size; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3985dc60f06bd07289336a2a3cbf357141558821 --- /dev/null +++ 
b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -0,0 +1,100 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP + +#include <Kokkos_Parallel.hpp> + +#include <HIP/Kokkos_HIP_BlockSize_Deduction.hpp> +#include <HIP/Kokkos_HIP_KernelLaunch.hpp> + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... Traits> +class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy<Traits...>; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + template <class TagType> + inline __device__ std::enable_if_t<std::is_void<TagType>::value> exec_range( + const Member i) const { + m_functor(i); + } + + template <class TagType> + inline __device__ std::enable_if_t<!std::is_void<TagType>::value> exec_range( + const Member i) const { + m_functor(TagType(), i); + } + + public: + using functor_type = FunctorType; + + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + const Member work_stride = blockDim.y * gridDim.x; + const Member work_end = m_policy.end(); + + for (Member iwork = + m_policy.begin() + threadIdx.y + blockDim.y * 
blockIdx.x; + iwork < work_end; + iwork = iwork < work_end - work_stride ? iwork + work_stride + : work_end) { + this->template exec_range<WorkTag>(iwork); + } + } + + inline void execute() const { + const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + + using DriverType = ParallelFor<FunctorType, Policy, Kokkos::HIP>; + const int block_size = + Kokkos::Impl::hip_get_preferred_blocksize<DriverType, LaunchBounds>(); + const dim3 block(1, block_size, 1); + const dim3 grid( + typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " + "valid execution configuration.")); + } + Kokkos::Impl::hip_parallel_launch<DriverType, LaunchBounds>( + *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), + false); + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp new file mode 100644 index 0000000000000000000000000000000000000000..83e890bce99abb9f9e749f03062686d25b3d3a5a --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -0,0 +1,183 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP + +#include <Kokkos_Parallel.hpp> + +#include <HIP/Kokkos_HIP_KernelLaunch.hpp> +#include <HIP/Kokkos_HIP_Team.hpp> +#include <HIP/Kokkos_HIP_Instance.hpp> +#include <HIP/Kokkos_HIP_TeamPolicyInternal.hpp> + +namespace Kokkos { +namespace Impl { + +template <typename FunctorType, typename... Properties> +class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> { + public: + using Policy = TeamPolicy<Properties...>; + using functor_type = FunctorType; + using size_type = HIP::size_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ team reduce space ] + // [ team shared space ] + + FunctorType const m_functor; + Policy const m_policy; + size_type const m_league_size; + int m_team_size; + size_type const m_vector_size; + int m_shmem_begin; + int m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + + template <typename TagType> + __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team( + const member_type& member) const { + m_functor(member); + } + + template <typename TagType> + __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team( + const member_type& member) const { + m_functor(TagType(), member); + } + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + __device__ inline void operator()() const { + // Iterate this block through the league + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = 
hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + int const int_league_size = static_cast<int>(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team<work_tag>(typename Policy::member_type( + kokkos_impl_hip_shared_memory<void>(), m_shmem_begin, m_shmem_size, + static_cast<void*>(static_cast<char*>(m_scratch_ptr[1]) + + ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size)); + } + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + inline void execute() const { + int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; + dim3 const grid(static_cast<int>(m_league_size), 1, 1); + dim3 const block(static_cast<int>(m_vector_size), + static_cast<int>(m_team_size), 1); + + using closure_type = + ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP>; + Impl::hip_parallel_launch<closure_type, launch_bounds>( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + } + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor<HIP, TeamPolicy> could not find a " + "valid execution configuration."); + } + + m_shmem_begin = (sizeof(double) * (m_team_size + 2)); + m_shmem_size = + (m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize<FunctorType>::value(m_functor, 
m_team_size)); + m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + m_scratch_ptr[0] = nullptr; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast<std::int64_t>(m_scratch_size[1]) * + (std::min( + static_cast<std::int64_t>(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast<std::int64_t>(m_league_size)))); + } + + unsigned int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); + } + + size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); + } + } + + ~ParallelFor() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp similarity index 61% rename from packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp rename to packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp index 
0fa325cb12c7250919fc0176f3533fca76f35bd5..162951164626fbde181016fa3d200c1b02126a95 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp @@ -14,157 +14,19 @@ // //@HEADER -#ifndef KOKKOS_HIP_PARALLEL_MDRANGE_HPP -#define KOKKOS_HIP_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP + +#include <Kokkos_Parallel.hpp> #include <HIP/Kokkos_HIP_BlockSize_Deduction.hpp> #include <HIP/Kokkos_HIP_KernelLaunch.hpp> #include <HIP/Kokkos_HIP_ReduceScan.hpp> #include <KokkosExp_MDRangePolicy.hpp> #include <impl/KokkosExp_IterateTileGPU.hpp> -#include <Kokkos_Parallel.hpp> namespace Kokkos { namespace Impl { -// ParallelFor -template <class FunctorType, class... Traits> -class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, HIP> { - public: - using Policy = Kokkos::MDRangePolicy<Traits...>; - using functor_type = FunctorType; - - private: - using array_index_type = typename Policy::array_index_type; - using index_type = typename Policy::index_type; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType, - typename Policy::work_tag>(m_policy, - m_functor) - .exec_range(); - } - - inline void execute() const { - using ClosureType = ParallelFor<FunctorType, Policy, HIP>; - if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); - if (Policy::rank == 2) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); - dim3 const grid( - std::min<array_index_type>( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - 
std::min<array_index_type>( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - 1); - hip_parallel_launch<ClosureType, LaunchBounds>( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 3) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], - m_policy.m_tile[2]); - dim3 const grid( - std::min<array_index_type>( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min<array_index_type>( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - std::min<array_index_type>( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch<ClosureType, LaunchBounds>( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2], m_policy.m_tile[3]); - dim3 const grid( - std::min<array_index_type>( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min<array_index_type>( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / - block.y, - maxblocks[1]), - std::min<array_index_type>( - (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch<ClosureType, LaunchBounds>( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 - // to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4]); - dim3 const grid( - std::min<array_index_type>( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min<array_index_type>( - 
m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min<array_index_type>( - (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch<ClosureType, LaunchBounds>( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; - // id4,id5 to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4] * m_policy.m_tile[5]); - dim3 const grid( - std::min<array_index_type>( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min<array_index_type>( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min<array_index_type>( - m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); - hip_parallel_launch<ClosureType, LaunchBounds>( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - } - - } // end execute - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - - template <typename Policy, typename Functor> - static int max_tile_size_product(const Policy&, const Functor&) { - using closure_type = - ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, HIP>; - unsigned block_size = hip_get_max_blocksize<closure_type, LaunchBounds>(); - if (block_size == 0) - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " - "tile size.")); - return block_size; - } -}; // ParallelReduce template <class CombinedFunctorReducerType, class... 
namespace Kokkos {
namespace Impl {

// ParallelReduce specialization for Kokkos::RangePolicy on the HIP backend.
//
// The closure stores the combined functor/reducer, the policy and the result
// pointer taken from the result view. execute() sizes the launch, obtains
// scratch buffers and launches the closure; operator()() is the device-side
// entry point and dispatches to one of the two run() overloads.
template <class CombinedFunctorReducerType, class... Traits>
class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
                     Kokkos::HIP> {
 public:
  using Policy      = Kokkos::RangePolicy<Traits...>;
  using FunctorType = typename CombinedFunctorReducerType::functor_type;
  using ReducerType = typename CombinedFunctorReducerType::reducer_type;

 private:
  using WorkRange    = typename Policy::WorkRange;
  using WorkTag      = typename Policy::work_tag;
  using Member       = typename Policy::member_type;
  using LaunchBounds = typename Policy::launch_bounds;

 public:
  using pointer_type   = typename ReducerType::pointer_type;
  using value_type     = typename ReducerType::value_type;
  using reference_type = typename ReducerType::reference_type;
  using functor_type   = FunctorType;
  using reducer_type   = ReducerType;
  using size_type      = Kokkos::HIP::size_type;
  using index_type     = typename Policy::index_type;
  // Conditionally set word_size_type to int16_t or int8_t if value_type is
  // smaller than int32_t (Kokkos::HIP::size_type)
  // word_size_type is used to determine the word count, shared memory buffer
  // size, and global memory buffer size before the scan is performed.
  // Within the scan, the word count is recomputed based on word_size_type
  // and when calculating indexes into the shared/global memory buffers for
  // performing the scan, word_size_type is used again.
  // For scalars > 4 bytes in size, indexing into shared/global memory relies
  // on the block and grid dimensions to ensure that we index at the correct
  // offset rather than at every 4 byte word; such that, when the join is
  // performed, we have the correct data that was copied over in chunks of 4
  // bytes.
  using word_size_type = std::conditional_t<
      sizeof(value_type) < sizeof(size_type),
      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
  // blockDim.z == 1

  const CombinedFunctorReducerType m_functor_reducer;
  const Policy m_policy;
  // data() of the result view passed to the constructor.
  const pointer_type m_result_ptr;
  // Whether HIPSpace / HostSpace can access the result view's memory space.
  const bool m_result_ptr_device_accessible;
  const bool m_result_ptr_host_accessible;
  // Both scratch pointers are null until execute() obtains them.
  word_size_type* m_scratch_space = nullptr;
  size_type* m_scratch_flags      = nullptr;

  // Selects the run() overload used by operator()(); false selects the
  // shared-memory variant.
  static constexpr bool UseShflReduction = false;

 private:
  struct ShflReductionTag {};
  struct SHMEMReductionTag {};

  // Calls the functor for index i with the reduction value; selected when the
  // policy has no work tag.
  template <class TagType>
  __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
      const Member& i, reference_type update) const {
    m_functor_reducer.get_functor()(i, update);
  }

  // Same as above, with a default-constructed work tag as the first argument;
  // selected when the policy has a work tag.
  template <class TagType>
  __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
      const Member& i, reference_type update) const {
    m_functor_reducer.get_functor()(TagType(), i, update);
  }

 public:
  // Device entry point: dispatches on UseShflReduction to one of the run()
  // overloads below.
  __device__ inline void operator()() const {
    using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
                                            SHMEMReductionTag>;
    run(ReductionTag{});
  }

  // Shared-memory variant: every thread accumulates into its own slot of the
  // block's shared memory, then the blocks are combined through
  // hip_single_inter_block_reduce_scan.
  __device__ inline void run(SHMEMReductionTag) const {
    const ReducerType& reducer = m_functor_reducer.get_reducer();
    // Number of word_size_type words occupied by one reduction value.
    const integral_nonzero_constant<word_size_type,
                                    ReducerType::static_value_size() /
                                        sizeof(word_size_type)>
        word_count(reducer.value_size() / sizeof(word_size_type));

    {
      // This thread's value lives at offset threadIdx.y * word_count in the
      // block's shared memory.
      reference_type value = reducer.init(reinterpret_cast<pointer_type>(
          ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() +
          threadIdx.y * word_count.value));

      // Number of blocks is bounded so that the reduction can be limited to two
      // passes. Each thread block is given an approximately equal amount of
      // work to perform. Accumulate the values for this block. The accumulation
      // ordering does not match the final pass, but is arithmetically
      // equivalent.

      const WorkRange range(m_policy, blockIdx.x, gridDim.x);

      // Threads of the block walk this block's range with stride blockDim.y.
      for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
           iwork < iwork_end; iwork += blockDim.y) {
        this->template exec_range<WorkTag>(iwork, value);
      }
    }

    // Reduce with final value at blockDim.y - 1 location.
    // Shortcut for length zero reduction
    bool do_final_reduction = m_policy.begin() == m_policy.end();
    if (!do_final_reduction)
      do_final_reduction = hip_single_inter_block_reduce_scan<false>(
          reducer, blockIdx.x, gridDim.x,
          ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>(),
          m_scratch_space, m_scratch_flags);
    if (do_final_reduction) {
      // This is the final block with the final result at the final threads'
      // location

      word_size_type* const shared =
          ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() +
          (blockDim.y - 1) * word_count.value;
      // Write straight to the result when the device can access it; otherwise
      // write to the scratch buffer, which execute() copies back afterwards.
      word_size_type* const global =
          m_result_ptr_device_accessible
              ? reinterpret_cast<word_size_type*>(m_result_ptr)
              : m_scratch_space;

      if (threadIdx.y == 0) {
        reducer.final(reinterpret_cast<value_type*>(shared));
      }

      // NOTE(review): the barrier is skipped when the value has no more words
      // than WarpSize; presumably the copying threads then share a warp with
      // thread 0 -- confirm.
      if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) {
        __syncthreads();
      }

      // Copy the finalized value word by word, spread across the threads.
      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
        global[i] = shared[i];
      }
    }
  }

  // Shuffle variant: every thread accumulates into a local value, then the
  // values are combined through hip_inter_block_shuffle_reduction.
  __device__ inline void run(ShflReductionTag) const {
    const ReducerType& reducer = m_functor_reducer.get_reducer();

    value_type value;
    reducer.init(&value);
    // Number of blocks is bounded so that the reduction can be limited to two
    // passes. Each thread block is given an approximately equal amount of work
    // to perform. Accumulate the values for this block. The accumulation
    // ordering does not match the final pass, but is arithmetically equivalent.

    WorkRange const range(m_policy, blockIdx.x, gridDim.x);

    for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
         iwork < iwork_end; iwork += blockDim.y) {
      this->template exec_range<WorkTag>(iwork, value);
    }

    pointer_type const result = reinterpret_cast<pointer_type>(m_scratch_space);

    // The smaller of this block's range length and blockDim.y ...
    int max_active_thread = static_cast<int>(range.end() - range.begin()) <
                                    static_cast<int>(blockDim.y)
                                ? range.end() - range.begin()
                                : blockDim.y;

    // ... except that an empty range counts as a full block.
    max_active_thread =
        (max_active_thread == 0) ? blockDim.y : max_active_thread;

    value_type init;
    reducer.init(&init);
    if (m_policy.begin() == m_policy.end()) {
      // Zero-length policy: finalize the initial value and store it directly.
      reducer.final(&value);
      pointer_type const final_result =
          m_result_ptr_device_accessible ? m_result_ptr : result;
      *final_result = value;
    } else if (Impl::hip_inter_block_shuffle_reduction<>(
                   value, init, reducer, m_scratch_space, result,
                   m_scratch_flags, max_active_thread)) {
      // Only the thread with flattened id 0 finalizes and stores the result.
      unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x;
      if (id == 0) {
        reducer.final(&value);
        pointer_type const final_result =
            m_result_ptr_device_accessible ? m_result_ptr : result;
        *final_result = value;
      }
    }
  }

  // Determine block size constrained by shared memory:
  // hip_get_preferred_blocksize is given a callback that reports the shared
  // memory needed by the functor for a candidate block size n.
  inline unsigned local_block_size(const FunctorType& f) {
    const auto& instance = m_policy.space().impl_internal_space_instance();
    auto shmem_functor   = [&f](unsigned n) {
      return hip_single_inter_block_reduce_scan_shmem<false, WorkTag,
                                                      value_type>(f, n);
    };
    return Kokkos::Impl::hip_get_preferred_blocksize<ParallelReduce,
                                                     LaunchBounds>(
        instance, shmem_functor);
  }

  // Runs the reduction.
  //
  // A kernel is launched when there is work, or when need_device_set is true
  // even for an empty range. Otherwise only reducer.init() is applied to the
  // result pointer (if it is non-null). Throws a runtime exception when no
  // valid block size could be found. When the device cannot access the result
  // pointer, the value is copied back from the scratch buffer after the
  // launch.
  inline void execute() {
    const ReducerType& reducer = m_functor_reducer.get_reducer();

    const index_type nwork     = m_policy.end() - m_policy.begin();
    const bool need_device_set = ReducerType::has_init_member_function() ||
                                 ReducerType::has_final_member_function() ||
                                 !m_result_ptr_host_accessible ||
                                 !std::is_same<ReducerType, InvalidType>::value;
    if ((nwork > 0) || need_device_set) {
      const int block_size = local_block_size(m_functor_reducer.get_functor());
      if (block_size == 0) {
        Kokkos::Impl::throw_runtime_exception(
            std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
                        "valid execution configuration."));
      }

      // REQUIRED ( 1 , N , 1 )
      dim3 block(1, block_size, 1);
      // use a slightly less constrained, but still well bounded limit for
      // scratch
      // Starting point: nwork / block.y, rounded up.
      int nblocks = (nwork + block.y - 1) / block.y;
      // Heuristic deciding the value of nblocks.
      // The general idea here is we want to:
      //    1. Not undersubscribe the device (i.e., we want at least
      //    preferred_block_min blocks)
      //    2. Have each thread reduce > 1 value to minimize overheads
      //    3. Limit the total # of blocks, to avoid unbounded scratch space
      constexpr int block_max           = 4096;
      constexpr int preferred_block_min = 1024;

      if (nblocks < preferred_block_min) {
        // keep blocks as is, already have low parallelism
      } else if (nblocks > block_max) {
        // "large dispatch" -> already have lots of parallelism
        nblocks = block_max;
      } else {
        // in the intermediate range, try to have each thread process multiple
        // items to offset the cost of the reduction (with not enough
        // parallelism to hide it)
        int items_per_thread =
            (nwork + nblocks * block_size - 1) / (nblocks * block_size);
        if (items_per_thread < 4) {
          // Shrink the grid by the smaller of two factors: the one that would
          // bring nblocks down to about preferred_block_min, and the one that
          // would bring items_per_thread up to about 4.
          int ratio = std::min(
              (nblocks + preferred_block_min - 1) / preferred_block_min,
              (4 + items_per_thread - 1) / items_per_thread);
          nblocks /= ratio;
        }
      }

      // TODO: down casting these uses more space than required?
      // Scratch holds one reduction value per block.
      m_scratch_space =
          (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space(
              m_policy.space(), reducer.value_size() * nblocks);
      // Intentionally do not downcast to word_size_type since we use HIP
      // atomics in Kokkos_HIP_ReduceScan.hpp
      m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags(
          m_policy.space(), sizeof(size_type));
      // Required grid.x <= block.y
      // NOTE(review): nblocks can be as large as block_max (4096) here, which
      // is not visibly bounded by block.y -- confirm whether the constraint
      // stated above still applies.
      dim3 grid(nblocks, 1, 1);

      // Empty range that still needs a device-side set: a single thread in a
      // single block is enough.
      if (nwork == 0) {
        block = dim3(1, 1, 1);
        grid  = dim3(1, 1, 1);
      }
      // The shuffle variant requests no dynamic shared memory.
      const int shmem =
          UseShflReduction
              ? 0
              : hip_single_inter_block_reduce_scan_shmem<false, WorkTag,
                                                         value_type>(
                    m_functor_reducer.get_functor(), block.y);

      Kokkos::Impl::hip_parallel_launch<ParallelReduce, LaunchBounds>(
          *this, grid, block, shmem,
          m_policy.space().impl_internal_space_instance(),
          false);  // copy to device and execute

      // The kernel wrote to scratch because the device cannot access the
      // result; copy the value back into it.
      if (!m_result_ptr_device_accessible && m_result_ptr) {
        const int size = reducer.value_size();
        DeepCopy<HostSpace, HIPSpace, HIP>(m_policy.space(), m_result_ptr,
                                           m_scratch_space, size);
      }
    } else {
      if (m_result_ptr) {
        reducer.init(m_result_ptr);
      }
    }
  }

  // Stores the functor/reducer, the policy and the result view's data
  // pointer, and records whether HIPSpace and HostSpace can access the view's
  // memory space. Nothing is launched here.
  template <class ViewType>
  ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer,
                 const Policy& arg_policy, const ViewType& arg_result)
      : m_functor_reducer(arg_functor_reducer),
        m_policy(arg_policy),
        m_result_ptr(arg_result.data()),
        m_result_ptr_device_accessible(
            MemorySpaceAccess<HIPSpace,
                              typename ViewType::memory_space>::accessible),
        m_result_ptr_host_accessible(
            MemorySpaceAccess<Kokkos::HostSpace,
                              typename ViewType::memory_space>::accessible) {}
};

}  // namespace Impl
}  // namespace Kokkos
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP + +#include <Kokkos_Parallel.hpp> + +#include <HIP/Kokkos_HIP_KernelLaunch.hpp> +#include <HIP/Kokkos_HIP_Team.hpp> +#include <HIP/Kokkos_HIP_Instance.hpp> +#include <HIP/Kokkos_HIP_TeamPolicyInternal.hpp> + +namespace Kokkos { +namespace Impl { + +template <class CombinedFunctorReducerType, class... Properties> +class ParallelReduce<CombinedFunctorReducerType, + Kokkos::TeamPolicy<Properties...>, HIP> { + public: + using Policy = TeamPolicy<Properties...>; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; + + public: + using functor_type = FunctorType; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::HIP::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the reduction is performed. + // Within the reduction, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the reduction, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::HIP::size_type), + std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, + Kokkos::HIP::size_type>; + using reducer_type = ReducerType; + using size_type = HIP::size_type; + + // static int constexpr UseShflReduction = false; + // FIXME_HIP This should be disabled unconditionally for best performance, but + // it currently causes tests to fail. + static constexpr int UseShflReduction = + (ReducerType::static_value_size() != 0); + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ global reduce space ] + // [ team reduce space ] + // [ team shared space ] + // + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + word_size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_team_begin; + size_type m_shmem_begin; + size_type m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + const size_type m_league_size; + int m_team_size; + const size_type m_vector_size; + + template <class TagType> + __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(member, update); + } + + template <class TagType> + __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), member, update); + } + + __device__ inline void iterate_through_league(int const threadid, + reference_type value) const { + int const int_league_size = 
static_cast<int>(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team<work_tag>( + member_type( + kokkos_impl_hip_shared_memory<char>() + m_team_begin, + m_shmem_begin, m_shmem_size, + reinterpret_cast<void*>( + reinterpret_cast<char*>(m_scratch_ptr[1]) + + static_cast<ptrdiff_t>(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size), + value); + } + } + + int compute_block_count() const { + constexpr auto light_weight = + Kokkos::Experimental::WorkItemProperty::HintLightWeight; + constexpr typename Policy::work_item_property property; + // Numbers were tuned on MI210 using dot product and yAx benchmarks + constexpr int block_max = + (property & light_weight) == light_weight ? 2097152 : 65536; + constexpr int preferred_block_min = 1024; + int block_count = m_league_size; + if (block_count < preferred_block_min) { + // keep blocks as is, already low parallelism + } else if (block_count >= block_max) { + block_count = block_max; + + } else { + int nwork = m_league_size * m_team_size; + int items_per_thread = + (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); + if (items_per_thread < 4) { + int ratio = std::min( + (block_count + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + block_count /= ratio; + } + } + + return block_count; + } + + public: + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag, + SHMEMReductionTag>; + run(ReductionTag{}, threadid); + + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + __device__ inline void run(SHMEMReductionTag, int const threadid) const { + const 
ReducerType& reducer = m_functor_reducer.get_reducer(); + + integral_nonzero_constant<word_size_type, ReducerType::static_value_size() / + sizeof(word_size_type)> const + word_count(reducer.value_size() / sizeof(word_size_type)); + + reference_type value = reducer.init(reinterpret_cast<pointer_type>( + kokkos_impl_hip_shared_memory<word_size_type>() + + threadIdx.y * word_count.value)); + // Iterate this block through the league + iterate_through_league(threadid, value); + + // Reduce with final value at blockDim.y - 1 location. + bool do_final_reduce = (m_league_size == 0); + if (!do_final_reduce) + do_final_reduce = hip_single_inter_block_reduce_scan<false>( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory<word_size_type>(), m_scratch_space, + m_scratch_flags); + if (do_final_reduce) { + // This is the final block with the final result at the final threads' + // location + + word_size_type* const shared = + kokkos_impl_hip_shared_memory<word_size_type>() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast<word_size_type*>(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast<value_type*>(shared)); + } + + if (HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + + // Iterate this block through the league + iterate_through_league(threadid, value); + + pointer_type const result = + m_result_ptr_device_accessible + ? 
m_result_ptr + : reinterpret_cast<pointer_type>(m_scratch_space); + + value_type init; + reducer.init(&init); + if (m_league_size == 0) { + reducer.final(&value); + *result = value; + } else if (Impl::hip_inter_block_shuffle_reduction( + value, init, reducer, + reinterpret_cast<pointer_type>(m_scratch_space), result, + m_scratch_flags, blockDim.y)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + *result = value; + } + } + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const bool is_empty_range = m_league_size == 0 || m_team_size == 0; + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + Policy::is_graph_kernel::value || + !std::is_same<ReducerType, InvalidType>::value; + if (!is_empty_range || need_device_set) { + int const block_count = compute_block_count(); + + m_scratch_space = + reinterpret_cast<word_size_type*>(hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count)); + m_scratch_flags = + hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); + + dim3 block(m_vector_size, m_team_size, 1); + dim3 grid(block_count, 1, 1); + if (is_empty_range) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + Impl::hip_parallel_launch<ParallelReduce, launch_bounds>( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().impl_internal_space_instance()->fence(); + + if (m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy<HostSpace, HIPSpace, HIP>(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + 
template <class ViewType> + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess<HIPSpace, + typename ViewType::memory_space>::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess<Kokkos::HostSpace, + typename ViewType::memory_space>::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_team_begin(0), + m_shmem_begin(0), + m_shmem_size(0), + m_scratch_ptr{nullptr, nullptr}, + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce<HIP, TeamPolicy> could not find a " + "valid execution configuration."); + } + + m_team_begin = + UseShflReduction + ? 
0 + : hip_single_inter_block_reduce_scan_shmem<false, work_tag, + value_type>( + arg_functor_reducer.get_functor(), m_team_size); + m_shmem_begin = sizeof(double) * (m_team_size + 2); + m_shmem_size = m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor_reducer.get_functor(), m_team_size); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast<std::int64_t>(m_scratch_size[1]) * + (std::min( + static_cast<std::int64_t>(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast<std::int64_t>(m_league_size)))); + } + + // The global parallel_reduce does not support vector_length other than 1 at + // the moment + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " + "greater than 1 is not currently supported for HIP for dynamic " + "sized reduction types."); + + if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " + "than 64 is not currently supported with HIP for dynamic sized " + "reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ + const unsigned int shmem_size_total = + m_team_begin + m_shmem_begin + m_shmem_size; + + if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); + } + + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " + "L0 scratch memory")); + } + + size_t max_size = arg_policy.team_size_max( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " + "large team size.")); + } + } + + ~ParallelReduce() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp similarity index 50% rename from packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp rename to packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp index 26e8be4698a85bf7075f5e2e24f6426c64acb827..41692a3291beeac9a10a9c59d8747822cddc75d9 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp @@ -14,390 +14,18 @@ // //@HEADER -#ifndef KOKKO_HIP_PARALLEL_RANGE_HPP -#define KOKKO_HIP_PARALLEL_RANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP #include <Kokkos_Parallel.hpp> -#if defined(__HIPCC__) - #include <HIP/Kokkos_HIP_BlockSize_Deduction.hpp> #include <HIP/Kokkos_HIP_KernelLaunch.hpp> #include 
<HIP/Kokkos_HIP_ReduceScan.hpp> -#include <HIP/Kokkos_HIP_Shuffle_Reduce.hpp> -#include <impl/Kokkos_Traits.hpp> namespace Kokkos { namespace Impl { -template <class FunctorType, class... Traits> -class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy<Traits...>; - - private: - using Member = typename Policy::member_type; - using WorkTag = typename Policy::work_tag; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - template <class TagType> - inline __device__ std::enable_if_t<std::is_void<TagType>::value> exec_range( - const Member i) const { - m_functor(i); - } - - template <class TagType> - inline __device__ std::enable_if_t<!std::is_void<TagType>::value> exec_range( - const Member i) const { - m_functor(TagType(), i); - } - - public: - using functor_type = FunctorType; - - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - const Member work_stride = blockDim.y * gridDim.x; - const Member work_end = m_policy.end(); - - for (Member iwork = - m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; - iwork < work_end; - iwork = iwork < work_end - work_stride ? 
iwork + work_stride - : work_end) { - this->template exec_range<WorkTag>(iwork); - } - } - - inline void execute() const { - const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); - - using DriverType = ParallelFor<FunctorType, Policy, Kokkos::HIP>; - const int block_size = - Kokkos::Impl::hip_get_preferred_blocksize<DriverType, LaunchBounds>(); - const dim3 block(1, block_size, 1); - const dim3 grid( - typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " - "valid execution configuration.")); - } - Kokkos::Impl::hip_parallel_launch<DriverType, LaunchBounds>( - *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), - false); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template <class CombinedFunctorReducerType, class... 
Traits> -class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, - Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy<Traits...>; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using WorkRange = typename Policy::WorkRange; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using LaunchBounds = typename Policy::launch_bounds; - - public: - using pointer_type = typename ReducerType::pointer_type; - using value_type = typename ReducerType::value_type; - using reference_type = typename ReducerType::reference_type; - using functor_type = FunctorType; - using reducer_type = ReducerType; - using size_type = Kokkos::HIP::size_type; - using index_type = typename Policy::index_type; - // Conditionally set word_size_type to int16_t or int8_t if value_type is - // smaller than int32_t (Kokkos::HIP::size_type) - // word_size_type is used to determine the word count, shared memory buffer - // size, and global memory buffer size before the scan is performed. - // Within the scan, the word count is recomputed based on word_size_type - // and when calculating indexes into the shared/global memory buffers for - // performing the scan, word_size_type is used again. - // For scalars > 4 bytes in size, indexing into shared/global memory relies - // on the block and grid dimensions to ensure that we index at the correct - // offset rather than at every 4 byte word; such that, when the join is - // performed, we have the correct data that was copied over in chunks of 4 - // bytes. 
- using word_size_type = std::conditional_t< - sizeof(value_type) < sizeof(size_type), - std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>; - - // Algorithmic constraints: blockSize is a power of two AND blockDim.y == - // blockDim.z == 1 - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - word_size_type* m_scratch_space = nullptr; - size_type* m_scratch_flags = nullptr; - - static constexpr bool UseShflReduction = false; - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Make the exec_range calls call to Reduce::DeviceIterateTile - template <class TagType> - __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(i, update); - } - - template <class TagType> - __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), i, update); - } - - public: - __device__ inline void operator()() const { - using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag, - SHMEMReductionTag>; - run(ReductionTag{}); - } - - __device__ inline void run(SHMEMReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - const integral_nonzero_constant<word_size_type, - ReducerType::static_value_size() / - sizeof(word_size_type)> - word_count(reducer.value_size() / sizeof(word_size_type)); - - { - reference_type value = reducer.init(reinterpret_cast<pointer_type>( - ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() + - threadIdx.y * word_count.value)); - - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of - // work to perform. 
Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically - // equivalent. - - const WorkRange range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range<WorkTag>(iwork, value); - } - } - - // Reduce with final value at blockDim.y - 1 location. - // Shortcut for length zero reduction - bool do_final_reduction = m_policy.begin() == m_policy.end(); - if (!do_final_reduction) - do_final_reduction = hip_single_inter_block_reduce_scan<false>( - reducer, blockIdx.x, gridDim.x, - ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>(), - m_scratch_space, m_scratch_flags); - if (do_final_reduction) { - // This is the final block with the final result at the final threads' - // location - - word_size_type* const shared = - ::Kokkos::kokkos_impl_hip_shared_memory<word_size_type>() + - (blockDim.y - 1) * word_count.value; - word_size_type* const global = - m_result_ptr_device_accessible - ? reinterpret_cast<word_size_type*>(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast<value_type*>(shared)); - } - - if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of work - // to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically equivalent. 
- - WorkRange const range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range<WorkTag>(iwork, value); - } - - pointer_type const result = reinterpret_cast<pointer_type>(m_scratch_space); - - int max_active_thread = static_cast<int>(range.end() - range.begin()) < - static_cast<int>(blockDim.y) - ? range.end() - range.begin() - : blockDim.y; - - max_active_thread = - (max_active_thread == 0) ? blockDim.y : max_active_thread; - - value_type init; - reducer.init(&init); - if (m_policy.begin() == m_policy.end()) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } else if (Impl::hip_inter_block_shuffle_reduction<>( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, max_active_thread)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? 
m_result_ptr : result; - *final_result = value; - } - } - } - - // Determine block size constrained by shared memory: - inline unsigned local_block_size(const FunctorType& f) { - const auto& instance = m_policy.space().impl_internal_space_instance(); - auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem<false, WorkTag, - value_type>(f, n); - }; - return Kokkos::Impl::hip_get_preferred_blocksize<ParallelReduce, - LaunchBounds>( - instance, shmem_functor); - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const index_type nwork = m_policy.end() - m_policy.begin(); - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - !std::is_same<ReducerType, InvalidType>::value; - if ((nwork > 0) || need_device_set) { - const int block_size = local_block_size(m_functor_reducer.get_functor()); - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " - "valid execution configuration.")); - } - - // REQUIRED ( 1 , N , 1 ) - dim3 block(1, block_size, 1); - // use a slightly less constrained, but still well bounded limit for - // scratch - int nblocks = (nwork + block.y - 1) / block.y; - // Heuristic deciding the value of nblocks. - // The general idea here is we want to: - // 1. Not undersubscribe the device (i.e., we want at least - // preferred_block_min blocks) - // 2. Have each thread reduce > 1 value to minimize overheads - // 3. 
Limit the total # of blocks, to avoid unbounded scratch space - constexpr int block_max = 4096; - constexpr int preferred_block_min = 1024; - - if (nblocks < preferred_block_min) { - // keep blocks as is, already have low parallelism - } else if (nblocks > block_max) { - // "large dispatch" -> already have lots of parallelism - nblocks = block_max; - } else { - // in the intermediate range, try to have each thread process multiple - // items to offset the cost of the reduction (with not enough - // parallelism to hide it) - int items_per_thread = - (nwork + nblocks * block_size - 1) / (nblocks * block_size); - if (items_per_thread < 4) { - int ratio = std::min( - (nblocks + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - nblocks /= ratio; - } - } - - // TODO: down casting these uses more space than required? - m_scratch_space = - (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * nblocks); - // Intentionally do not downcast to word_size_type since we use HIP - // atomics in Kokkos_HIP_ReduceScan.hpp - m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( - m_policy.space(), sizeof(size_type)); - // Required grid.x <= block.y - dim3 grid(nblocks, 1, 1); - - if (nwork == 0) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem = - UseShflReduction - ? 
0 - : hip_single_inter_block_reduce_scan_shmem<false, WorkTag, - value_type>( - m_functor_reducer.get_functor(), block.y); - - Kokkos::Impl::hip_parallel_launch<ParallelReduce, LaunchBounds>( - *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute - - if (!m_result_ptr_device_accessible && m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy<HostSpace, HIPSpace, HIP>(m_policy.space(), m_result_ptr, - m_scratch_space, size); - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template <class ViewType> - ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - const Policy& arg_policy, const ViewType& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess<HIPSpace, - typename ViewType::memory_space>::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess<Kokkos::HostSpace, - typename ViewType::memory_space>::accessible) {} -}; - template <class FunctorType, class ValueType, class... Traits> class ParallelScanHIPBase { public: @@ -763,5 +391,3 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, } // namespace Kokkos #endif - -#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp deleted file mode 100644 index 3fe568ac361f547bfd3eeb19526c1ad289b70380..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ /dev/null @@ -1,936 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKO_HIP_PARALLEL_TEAM_HPP -#define KOKKO_HIP_PARALLEL_TEAM_HPP - -#include <Kokkos_Parallel.hpp> - -#if defined(__HIPCC__) - -#include <HIP/Kokkos_HIP_KernelLaunch.hpp> -#include <HIP/Kokkos_HIP_Team.hpp> -#include <HIP/Kokkos_HIP_Instance.hpp> -#include <Kokkos_MinMaxClamp.hpp> - -namespace Kokkos { -namespace Impl { - -template <typename... Properties> -class TeamPolicyInternal<HIP, Properties...> - : public PolicyTraits<Properties...> { - public: - using execution_policy = TeamPolicyInternal; - - using traits = PolicyTraits<Properties...>; - - template <typename ExecSpace, typename... OtherProperties> - friend class TeamPolicyInternal; - - private: - typename traits::execution_space m_space; - int m_league_size; - int m_team_size; - int m_vector_length; - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; - int m_chunk_size; - bool m_tune_team_size; - bool m_tune_vector_length; - - public: - using execution_space = HIP; - - template <class... 
OtherProperties> - TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_vector_length = p.m_vector_length; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - m_space = p.m_space; - m_tune_team_size = p.m_tune_team_size; - m_tune_vector_length = p.m_tune_vector_length; - } - - template <typename FunctorType> - int team_size_max(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>; - - return internal_team_size_common<BlockType::Max, closure_type, void>(f); - } - - template <class FunctorType> - inline int team_size_max(const FunctorType& f, - const ParallelReduceTag&) const { - using functor_analysis_type = - Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, - TeamPolicyInternal, FunctorType, void>; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer<FunctorType, - typename functor_analysis_type::Reducer>, - TeamPolicy<Properties...>, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Max, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template <typename FunctorType, typename ReducerType> - inline int team_size_max(const FunctorType& f, const ReducerType&, - const ParallelReduceTag&) const { - using closure_type = - Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>, - TeamPolicy<Properties...>, Kokkos::HIP>; - return internal_team_size_common<BlockType::Max, closure_type, - typename ReducerType::value_type>(f); - } - - template <typename FunctorType> - int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>; - 
- return internal_team_size_common<BlockType::Preferred, closure_type, void>( - f); - } - - template <typename FunctorType> - inline int team_size_recommended(FunctorType const& f, - ParallelReduceTag const&) const { - using functor_analysis_type = - Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, - TeamPolicyInternal, FunctorType, void>; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer<FunctorType, - typename functor_analysis_type::Reducer>, - TeamPolicy<Properties...>, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Preferred, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template <typename FunctorType, typename ReducerType> - int team_size_recommended(FunctorType const& f, ReducerType const&, - ParallelReduceTag const&) const { - using closure_type = - Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>, - TeamPolicy<Properties...>, Kokkos::HIP>; - return internal_team_size_common<BlockType::Preferred, closure_type, - typename ReducerType::value_type>(f); - } - - inline bool impl_auto_vector_length() const { return m_tune_vector_length; } - inline bool impl_auto_team_size() const { return m_tune_team_size; } - static int vector_length_max() { return HIPTraits::WarpSize; } - - static int verify_requested_vector_length(int requested_vector_length) { - int test_vector_length = - std::min(requested_vector_length, vector_length_max()); - - // Allow only power-of-two vector_length - if (!(is_integral_power_of_two(test_vector_length))) { - int test_pow2 = 1; - constexpr int warp_size = HIPTraits::WarpSize; - while (test_pow2 < warp_size) { - test_pow2 <<= 1; - if (test_pow2 > test_vector_length) { - break; - } - } - test_vector_length = test_pow2 >> 1; - } - - return test_vector_length; - } - - inline static int scratch_size_max(int level) { - // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team - // reductions. 
They also use one int64_t in static shared memory for a - // shared ID. Furthermore, they use additional scratch memory in some - // reduction scenarios, which depend on the size of the value_type and is - // NOT captured here - constexpr size_t max_possible_team_size = 1024; - constexpr size_t max_reserved_shared_mem_per_team = - (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); - // arbitrarily setting level 1 scratch limit to 20MB, for a - // MI250 that would give us about 4.4GB for 2 teams per CU - constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; - - size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; - return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team - : max_l1_scratch_size); - } - - inline void impl_set_vector_length(size_t size) { m_vector_length = size; } - inline void impl_set_team_size(size_t size) { m_team_size = size; } - int impl_vector_length() const { return m_vector_length; } - - int team_size() const { return m_team_size; } - - int league_size() const { return m_league_size; } - - size_t scratch_size(int level, int team_size_ = -1) const { - if (team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + - team_size_ * m_thread_scratch_size[level]; - } - - size_t team_scratch_size(int level) const { - return m_team_scratch_size[level]; - } - - size_t thread_scratch_size(int level) const { - return m_thread_scratch_size[level]; - } - - typename traits::execution_space space() const { return m_space; } - - TeamPolicyInternal() - : m_space(typename traits::execution_space()), - m_league_size(0), - m_team_size(-1), - m_vector_length(0), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(false), - m_tune_vector_length(false) {} - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, int vector_length_request = 1) - : 
m_space(space_), - m_league_size(league_size_), - m_team_size(team_size_request), - m_vector_length( - (vector_length_request > 0) - ? verify_requested_vector_length(vector_length_request) - : (verify_requested_vector_length(1))), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(bool(team_size_request <= 0)), - m_tune_vector_length(bool(vector_length_request <= 0)) { - // Make sure league size is permissible - if (league_size_ >= static_cast<int>(hip_internal_maximum_grid_count()[0])) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - - // Make sure total block size is permissible - if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " - "Team size x vector length must be smaller than 1024.")); - } - } - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} - // FLAG - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - ) - : TeamPolicyInternal(space_, league_size_, team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(space_, league_size_, -1, -1) - - {} - - TeamPolicyInternal(int league_size_, int team_size_request, - int vector_length_request = 1) - : TeamPolicyInternal(typename 
traits::execution_space(), league_size_, - team_size_request, vector_length_request) {} - - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - vector_length_request) {} - - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(int league_size_, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - -1) {} - - int chunk_size() const { return m_chunk_size; } - - TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { - m_chunk_size = chunk_size_; - return *this; - } - - /** \brief set per team scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerTeamValue const& per_team) { - m_team_scratch_size[level] = per_team.value; - return *this; - } - - /** \brief set per thread scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerThreadValue const& per_thread) { - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - /** \brief set per thread and per team scratch size for a specific level of - * the scratch hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, - PerThreadValue const& per_thread) { - m_team_scratch_size[level] = per_team.value; - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - using member_type = 
Kokkos::Impl::HIPTeamMember; - - protected: - template <BlockType BlockSize, class ClosureType, class ValueType, - class FunctorType> - int internal_team_size_common(FunctorType const& f) const { - const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); - unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); - using Tag = typename PatternTagFromImplSpecialization<ClosureType>::type; - if constexpr (std::is_same_v<Tag, ParallelReduceTag>) { - using Interface = - typename Impl::DeduceFunctorPatternInterface<ClosureType>::type; - using Analysis = - Impl::FunctorAnalysis<Interface, typename ClosureType::Policy, - FunctorType, ValueType>; - shmem_thread += - ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); - } - const int vector_length = impl_vector_length(); - - const auto functor = [&f, shmem_block, shmem_thread, vector_length]( - const hipFuncAttributes& attr, int block_size) { - int functor_shmem = - ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value( - f, block_size / vector_length); - return shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - }; - int block_size; - if constexpr (BlockSize == BlockType::Max) { - block_size = hip_get_max_team_blocksize<ClosureType, - typename traits::launch_bounds>( - space().impl_internal_space_instance(), functor); - } else { - block_size = - hip_get_preferred_team_blocksize<ClosureType, - typename traits::launch_bounds>( - space().impl_internal_space_instance(), functor); - } - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " - "team size.")); - } - if constexpr (std::is_same_v<Tag, ParallelForTag>) { - return block_size / impl_vector_length(); - } else { - // Currently we require Power-of-2 team size for reductions. 
- int p2 = 1; - while (p2 <= block_size) p2 *= 2; - p2 /= 2; - return p2 / impl_vector_length(); - } - } -}; - -__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, - int32_t* scratch_locks, - size_t num_scratch_locks) { - int64_t threadid = 0; - __shared__ int64_t base_thread_id; - if (threadIdx.x == 0 && threadIdx.y == 0) { - int64_t const wraparound_len = - Kokkos::min(int64_t(league_size), - int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); - threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; - threadid *= blockDim.x * blockDim.y; - int done = 0; - while (!done) { - done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); - if (!done) { - threadid += blockDim.x * blockDim.y; - if (int64_t(threadid + blockDim.x * blockDim.y) >= - wraparound_len * blockDim.x * blockDim.y) - threadid = 0; - } - } - base_thread_id = threadid; - } - __syncthreads(); - threadid = base_thread_id; - return threadid; -} - -__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, - int64_t threadid) { - __syncthreads(); - if (threadIdx.x == 0 && threadIdx.y == 0) { - scratch_locks[threadid] = 0; - } -} - -template <typename FunctorType, typename... 
Properties> -class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> { - public: - using Policy = TeamPolicy<Properties...>; - using functor_type = FunctorType; - using size_type = HIP::size_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ team reduce space ] - // [ team shared space ] - - FunctorType const m_functor; - Policy const m_policy; - size_type const m_league_size; - int m_team_size; - size_type const m_vector_size; - int m_shmem_begin; - int m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - - template <typename TagType> - __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team( - const member_type& member) const { - m_functor(member); - } - - template <typename TagType> - __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team( - const member_type& member) const { - m_functor(TagType(), member); - } - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - __device__ inline void operator()() const { - // Iterate this block through the league - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - int const int_league_size = static_cast<int>(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team<work_tag>(typename Policy::member_type( - kokkos_impl_hip_shared_memory<void>(), m_shmem_begin, m_shmem_size, - static_cast<void*>(static_cast<char*>(m_scratch_ptr[1]) + - 
ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size)); - } - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - inline void execute() const { - int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; - dim3 const grid(static_cast<int>(m_league_size), 1, 1); - dim3 const block(static_cast<int>(m_vector_size), - static_cast<int>(m_team_size), 1); - - using closure_type = - ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP>; - Impl::hip_parallel_launch<closure_type, launch_bounds>( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - } - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); - - m_shmem_begin = (sizeof(double) * (m_team_size + 2)); - m_shmem_size = - (m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size)); - m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
- m_scratch_ptr[0] = nullptr; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast<std::int64_t>(m_scratch_size[1]) * - (std::min( - static_cast<std::int64_t>(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast<std::int64_t>(m_league_size)))); - } - - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); - } - - size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); - if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); - } - } - - ~ParallelFor() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template <class CombinedFunctorReducerType, class... 
Properties> -class ParallelReduce<CombinedFunctorReducerType, - Kokkos::TeamPolicy<Properties...>, HIP> { - public: - using Policy = TeamPolicyInternal<HIP, Properties...>; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - using value_type = typename ReducerType::value_type; - - public: - using functor_type = FunctorType; - using size_type = HIP::size_type; - - // static int constexpr UseShflReduction = false; - // FIXME_HIP This should be disabled unconditionally for best performance, but - // it currently causes tests to fail. - static constexpr int UseShflReduction = - (ReducerType::static_value_size() != 0); - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ global reduce space ] - // [ team reduce space ] - // [ team shared space ] - // - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - size_type* m_scratch_space; - size_type* m_scratch_flags; - size_type m_team_begin; - size_type m_shmem_begin; - size_type m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - const size_type m_league_size; - int m_team_size; - const size_type m_vector_size; - - template <class TagType> - __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team( - member_type 
const& member, reference_type update) const { - m_functor_reducer.get_functor()(member, update); - } - - template <class TagType> - __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), member, update); - } - - __device__ inline void iterate_through_league(int const threadid, - reference_type value) const { - int const int_league_size = static_cast<int>(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team<work_tag>( - member_type( - kokkos_impl_hip_shared_memory<char>() + m_team_begin, - m_shmem_begin, m_shmem_size, - reinterpret_cast<void*>( - reinterpret_cast<char*>(m_scratch_ptr[1]) + - static_cast<ptrdiff_t>(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size), - value); - } - } - - int compute_block_count() const { - constexpr auto light_weight = - Kokkos::Experimental::WorkItemProperty::HintLightWeight; - constexpr typename Policy::work_item_property property; - // Numbers were tuned on MI210 using dot product and yAx benchmarks - constexpr int block_max = - (property & light_weight) == light_weight ? 
2097152 : 65536; - constexpr int preferred_block_min = 1024; - int block_count = m_league_size; - if (block_count < preferred_block_min) { - // keep blocks as is, already low parallelism - } else if (block_count >= block_max) { - block_count = block_max; - - } else { - int nwork = m_league_size * m_team_size; - int items_per_thread = - (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); - if (items_per_thread < 4) { - int ratio = std::min( - (block_count + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - block_count /= ratio; - } - } - - return block_count; - } - - public: - __device__ inline void operator()() const { - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag, - SHMEMReductionTag>; - run(ReductionTag{}, threadid); - - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - __device__ inline void run(SHMEMReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - integral_nonzero_constant<size_type, ReducerType::static_value_size() / - sizeof(size_type)> const - word_count(reducer.value_size() / sizeof(size_type)); - - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory<size_type>() + - threadIdx.y * word_count.value); - // Iterate this block through the league - iterate_through_league(threadid, value); - - // Reduce with final value at blockDim.y - 1 location. 
- bool do_final_reduce = (m_league_size == 0); - if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan<false, FunctorType, work_tag>( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory<size_type>(), m_scratch_space, - m_scratch_flags); - if (do_final_reduce) { - // This is the final block with the final result at the final threads' - // location - - size_type* const shared = kokkos_impl_hip_shared_memory<size_type>() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast<size_type*>(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast<value_type*>(shared)); - } - - if (HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - - // Iterate this block through the league - iterate_through_league(threadid, value); - - pointer_type const result = - m_result_ptr_device_accessible - ? 
m_result_ptr - : reinterpret_cast<pointer_type>(m_scratch_space); - - value_type init; - reducer.init(&init); - if (m_league_size == 0) { - reducer.final(&value); - *result = value; - } else if (Impl::hip_inter_block_shuffle_reduction( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, blockDim.y)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - *result = value; - } - } - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const bool is_empty_range = m_league_size == 0 || m_team_size == 0; - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - Policy::is_graph_kernel::value || - !std::is_same<ReducerType, InvalidType>::value; - if (!is_empty_range || need_device_set) { - int const block_count = compute_block_count(); - - m_scratch_space = hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * block_count); - m_scratch_flags = - hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - - dim3 block(m_vector_size, m_team_size, 1); - dim3 grid(block_count, 1, 1); - if (is_empty_range) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - Impl::hip_parallel_launch<ParallelReduce, launch_bounds>( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().impl_internal_space_instance()->fence(); - - if (m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy<HostSpace, HIPSpace>(m_result_ptr, m_scratch_space, size); - } - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template <class ViewType> - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - 
Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess<HIPSpace, - typename ViewType::memory_space>::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess<Kokkos::HostSpace, - typename ViewType::memory_space>::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_team_begin(0), - m_shmem_begin(0), - m_shmem_size(0), - m_scratch_ptr{nullptr, nullptr}, - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - - m_team_begin = - UseShflReduction - ? 0 - : hip_single_inter_block_reduce_scan_shmem<false, work_tag, - value_type>( - arg_functor_reducer.get_functor(), m_team_size); - m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize<FunctorType>::value( - arg_functor_reducer.get_functor(), m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast<std::int64_t>(m_scratch_size[1]) * - (std::min( - static_cast<std::int64_t>(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast<std::int64_t>(m_league_size)))); - } - - // The global 
parallel_reduce does not support vector_length other than 1 at - // the moment - if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " - "greater than 1 is not currently supported for HIP for dynamic " - "sized reduction types."); - - if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " - "than 64 is not currently supported with HIP for dynamic sized " - "reduction types."); - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && - !UseShflReduction) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); - } - - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " - "L0 scratch memory")); - } - - size_t max_size = arg_policy.team_size_max( - arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " - "large team size.")); - } - } - - ~ParallelReduce() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; -} // namespace Impl -} // namespace Kokkos - -#endif - -#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index 
ea599989e7ad9617e30678c0712dee1f95727752..0b679218092d9bba4da6c0ffbf156474fa925654 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -18,138 +18,18 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif -#include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp> -#include <HIP/Kokkos_HIP_DeepCopy.hpp> #include <HIP/Kokkos_HIP.hpp> +#include <HIP/Kokkos_HIP_DeepCopy.hpp> +#include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp> +#include <impl/Kokkos_SharedAlloc_timpl.hpp> -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord<void, void> - SharedAllocationRecord<HIPSpace, void>::s_root_record; - -SharedAllocationRecord<void, void> - SharedAllocationRecord<HIPHostPinnedSpace, void>::s_root_record; - -SharedAllocationRecord<void, void> - SharedAllocationRecord<HIPManagedSpace, void>::s_root_record; -#endif - -SharedAllocationRecord<HIPSpace, void>::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord<HIPHostPinnedSpace, void>::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - SharedAllocationRecord<void, void>::m_alloc_size); -} - -SharedAllocationRecord<HIPManagedSpace, void>::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - SharedAllocationRecord<void, void>::m_alloc_size); -} - -SharedAllocationRecord<HIPSpace, void>::SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through 
deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<HIPSpace, void>::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - HIP exec; - Kokkos::Impl::DeepCopy<HIPSpace, HostSpace>( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord<Kokkos::HIPSpace, " - "void>::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord<HIPSpace, void>::SharedAllocationRecord( - const HIP& arg_exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<HIPSpace, void>::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy<HIPSpace, HostSpace>(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord<HIPHostPinnedSpace, void>::SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass 
through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<HIPHostPinnedSpace, void>::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via host pinned memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord<HIPManagedSpace, void>::SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<HIPManagedSpace, void>::s_root_record, +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPSpace); +#else +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HIPSpace); #endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via managed memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPManagedSpace); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index 
e68bad9723079fcbf0666fa2945391db446f64c8..a464609108cdc89e611f07cca3e1f2f08f299e9b 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -18,120 +18,15 @@ #define KOKKOS_HIP_SHARED_ALLOCATION_RECORD_HPP #include <HIP/Kokkos_HIP_Space.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord<HIPSpace, void> - : public HostInaccessibleSharedAllocationRecordCommon<HIPSpace> { - private: - friend class SharedAllocationRecordCommon<HIPSpace>; - friend class HostInaccessibleSharedAllocationRecordCommon<HIPSpace>; - using base_t = HostInaccessibleSharedAllocationRecordCommon<HIPSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec*/, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIP& exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord<HIPHostPinnedSpace, void> - : public SharedAllocationRecordCommon<HIPHostPinnedSpace> { - private: - friend 
class SharedAllocationRecordCommon<HIPHostPinnedSpace>; - using base_t = SharedAllocationRecordCommon<HIPHostPinnedSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; +#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPSpace); +#else +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::HIPSpace); #endif - - const HIPHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord<HIPManagedSpace, void> - : public SharedAllocationRecordCommon<HIPManagedSpace> { - private: - friend class SharedAllocationRecordCommon<HIPManagedSpace>; - using base_t = SharedAllocationRecordCommon<HIPManagedSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPManagedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template <typename ExecutionSpace> - SharedAllocationRecord( - 
const ExecutionSpace& /*exec_space*/, const HIPManagedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPManagedSpace); #endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index 4035bb012132798ed511eaccac823be0be218114..feee44ccaf17abbb62eb14ac3b792897c8520ceb 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -100,7 +100,7 @@ template <class FunctorType> __device__ inline bool hip_inter_block_shuffle_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, FunctorType const& reducer, - HIP::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, HIP::size_type* const m_scratch_flags, int const max_active_thread = blockDim.y) { @@ -115,9 +115,8 @@ __device__ inline bool hip_inter_block_shuffle_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = - reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x; - *global = value; + pointer_type global = m_scratch_space + blockIdx.x; + *global = value; __threadfence(); } @@ -140,8 +139,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction( last_block = true; value = neutral; - pointer_type 
const global = - reinterpret_cast<pointer_type>(m_scratch_space); + pointer_type const global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = blockDim.x * blockDim.y < warp_size diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 7f6aa0d8e82d7e392d97119b5ab1cac65db2916e..47f07b31abfe824896aba05cb68a98946b6c354a 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -24,10 +24,8 @@ #include <HIP/Kokkos_HIP_Space.hpp> #include <HIP/Kokkos_HIP_DeepCopy.hpp> -#include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp> #include <impl/Kokkos_Error.hpp> -#include <impl/Kokkos_MemorySpace.hpp> #include <impl/Kokkos_DeviceManagement.hpp> #include <impl/Kokkos_ExecSpaceManager.hpp> @@ -41,6 +39,7 @@ /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ + namespace { static std::atomic<bool> is_first_hip_managed_allocation(true); @@ -52,39 +51,63 @@ static std::atomic<bool> is_first_hip_managed_allocation(true); namespace Kokkos { -HIPSpace::HIPSpace() : m_device(HIP().hip_device()) {} +HIPSpace::HIPSpace() + : m_device(HIP().hip_device()), m_stream(HIP().hip_stream()) {} HIPHostPinnedSpace::HIPHostPinnedSpace() {} HIPManagedSpace::HIPManagedSpace() : m_device(HIP().hip_device()) {} +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY +void* HIPSpace::allocate(const HIP& exec_space, + const size_t arg_alloc_size) const { + return allocate(exec_space, "[unlabeled]", arg_alloc_size); +} + +void* HIPSpace::allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(exec_space.hip_stream(), arg_label, arg_alloc_size, + arg_logical_size, true); +} +#endif + void* HIPSpace::allocate(const size_t arg_alloc_size) 
const { return allocate("[unlabeled]", arg_alloc_size); } -void* HIPSpace::allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +void* HIPSpace::allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(m_stream, arg_label, arg_alloc_size, arg_logical_size, + false); } -void* HIPSpace::impl_allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { +void* HIPSpace::impl_allocate( + [[maybe_unused]] const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, const size_t arg_logical_size, + [[maybe_unused]] const bool stream_sync_only) const { void* ptr = nullptr; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + auto const error_code = hipMallocAsync(&ptr, arg_alloc_size, stream); + if (stream_sync_only) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(stream)); + } else { + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); + } +#else auto const error_code = hipMalloc(&ptr, arg_alloc_size); +#endif + if (error_code != hipSuccess) { // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here (void)hipGetLastError(); - throw Experimental::HIPRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - HIPMalloc); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { + const Kokkos::Tools::SpaceHandle arg_handle = + Kokkos::Tools::make_space_handle(name()); const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); @@ -113,10 +136,7 @@ void* HIPHostPinnedSpace::impl_allocate( // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here (void)hipGetLastError(); - throw Experimental::HIPRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - HIPHostMalloc); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = @@ -180,10 +200,7 @@ Kokkos::HIP::runtime WARNING: Kokkos did not find an environment variable 'HSA_X // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here (void)hipGetLastError(); - throw Experimental::HIPRawMemoryAllocationFailure( - arg_alloc_size, error_code, - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - HIPMallocManaged); + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } KOKKOS_IMPL_HIP_SAFE_CALL(hipMemAdvise( ptr, arg_alloc_size, hipMemAdviseSetCoarseGrain, m_device)); @@ -230,7 +247,12 @@ void HIPSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + KOKKOS_IMPL_HIP_SAFE_CALL(hipFreeAsync(arg_alloc_ptr, m_stream)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); +#else KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); +#endif } void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr, @@ -287,22 +309,3 @@ void HIPManagedSpace::impl_deallocate( } } // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#include <impl/Kokkos_SharedAlloc_timpl.hpp> - -namespace Kokkos { -namespace Impl { - 
-// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon<HIPSpace>; -template class SharedAllocationRecordCommon<HIPSpace>; -template class SharedAllocationRecordCommon<HIPHostPinnedSpace>; -template class SharedAllocationRecordCommon<HIPManagedSpace>; - -} // end namespace Impl -} // end namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index f3e5adf87e5cbf1f0e1ac83709de0f0478ab5fb2..2380772cacf853ef091785294ce8fb556fb6405b 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -58,13 +58,30 @@ class HIPSpace { /*--------------------------------*/ HIPSpace(); - HIPSpace(HIPSpace&& rhs) = default; - HIPSpace(const HIPSpace& rhs) = default; - HIPSpace& operator=(HIPSpace&& rhs) = default; + HIPSpace(HIPSpace&& rhs) = default; + HIPSpace(const HIPSpace& rhs) = default; + HIPSpace& operator=(HIPSpace&& rhs) = default; HIPSpace& operator=(const HIPSpace& rhs) = default; ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ +#ifdef KOKKOS_IMPL_HIP_UNIFIED_MEMORY + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } +#else + void* allocate(const HIP& exec_space, const size_t arg_alloc_size) const; + void* allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; 
+#endif void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -76,12 +93,10 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - template <class, class, class, class> - friend class LogicalMemorySpace; - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; + void* impl_allocate(const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size, + bool stream_sync_only) const; void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -94,8 +109,7 @@ class HIPSpace { private: int m_device; ///< Which HIP device - - friend class Kokkos::Impl::SharedAllocationRecord<HIPSpace, void>; + hipStream_t m_stream; }; template <> @@ -122,13 +136,23 @@ class HIPHostPinnedSpace { /*--------------------------------*/ HIPHostPinnedSpace(); - HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; - HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; - HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; + HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; HIPHostPinnedSpace& operator=(const HIPHostPinnedSpace& rhs) = default; ~HIPHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + 
return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -140,8 +164,6 @@ class HIPHostPinnedSpace { const size_t arg_logical_size = 0) const; private: - template <class, class, class, class> - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -187,13 +209,23 @@ class HIPManagedSpace { /*--------------------------------*/ HIPManagedSpace(); - HIPManagedSpace(HIPManagedSpace&& rhs) = default; - HIPManagedSpace(const HIPManagedSpace& rhs) = default; - HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(const HIPManagedSpace& rhs) = default; + HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; HIPManagedSpace& operator=(const HIPManagedSpace& rhs) = default; ~HIPManagedSpace() = default; /**\brief Allocate untracked memory in the space */ + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -209,8 +241,6 @@ class HIPManagedSpace { private: int m_device; ///< Which HIP device - template <class, class, class, class> - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = 
@@ -239,15 +269,18 @@ struct Impl::is_hip_type_space<HIPManagedSpace> : public std::true_type {}; namespace Kokkos { namespace Impl { -static_assert(Kokkos::Impl::MemorySpaceAccess<HIPSpace, HIPSpace>::assignable, - ""); +static_assert(Kokkos::Impl::MemorySpaceAccess<HIPSpace, HIPSpace>::assignable); //---------------------------------------- template <> struct MemorySpaceAccess<HostSpace, HIPSpace> { enum : bool { assignable = false }; - enum : bool { accessible = false }; +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + enum : bool{accessible = false}; +#else + enum : bool { accessible = true }; +#endif enum : bool { deepcopy = true }; }; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index fb466d8a721f9a343fb9083a289a80e71f5ab2e8..1724b4361db8dc94404756235df17909b9d12faa 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -183,7 +183,7 @@ class HIPTeamMember { typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy<HIP>, ReducerType, typename ReducerType::value_type>::Reducer wrapped_reducer(reducer); - hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value; #else (void)reducer; @@ -191,6 +191,19 @@ class HIPTeamMember { #endif } + template <typename WrappedReducerType> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<WrappedReducerType>::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { +#ifdef __HIP_DEVICE_COMPILE__ + hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); +#else + (void)wrapped_reducer; + (void)value; +#endif + } + //-------------------------------------------------------------------------- /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team 
non-deterministic ordering accumulation. @@ -261,17 +274,37 @@ class HIPTeamMember { KOKKOS_INLINE_FUNCTION static std::enable_if_t<is_reducer<ReducerType>::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { +#ifdef __HIP_DEVICE_COMPILE__ + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<HIP>, ReducerType, + value_type>::Reducer; + + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; +#else + (void)reducer; + (void)value; +#endif + } + + template <typename WrappedReducerType> + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer<WrappedReducerType>::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { #ifdef __HIP_DEVICE_COMPILE__ if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = blockDim.x; (i >>= 1);) { in_place_shfl_down(tmp2, tmp, i, blockDim.x); if (static_cast<int>(threadIdx.x) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -281,10 +314,9 @@ class HIPTeamMember { // and thus different threads could have different results. 
in_place_shfl(tmp2, tmp, 0, blockDim.x); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; #else - (void)reducer; + (void)wrapped_reducer; (void)value; #endif } @@ -479,15 +511,26 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HIPTeamMember::execution_space>, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -508,24 +551,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum<ValueType> reducer(val); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HIPTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); 
value_type value; + wrapped_reducer.init(&value); - for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; - i += blockDim.y) { - closure(i, val); - } + for (iType i = loop_boundaries.start + threadIdx.y; + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } /** \brief Inter-thread parallel exclusive prefix sum. @@ -620,16 +663,26 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HIPTeamMember::execution_space>, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; i < loop_boundaries.end; i += blockDim.y * blockDim.x) { closure(i, value); } - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -642,25 +695,27 @@ KOKKOS_INLINE_FUNCTION 
std::enable_if_t<!Kokkos::is_reducer<ValueType>::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum<ValueType> reducer(val); - - reducer.init(reducer.reference()); - - for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; i += blockDim.y * blockDim.x) { - closure(i, val); - } - - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HIPTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- @@ -706,14 +761,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using 
functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HIPTeamMember::execution_space>, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; i += blockDim.x) { - closure(i, reducer.reference()); + closure(i, value); } - Impl::HIPTeamMember::vector_reduce(reducer); + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -737,20 +804,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!is_reducer<ValueType>::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - result = ValueType(); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HIPTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; - i += blockDim.x) { - closure(i, result); - } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - Impl::HIPTeamMember::vector_reduce(Kokkos::Sum<ValueType>(result)); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) 
+ + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f21c65f16dd8745b54e93bb42a6718157123fa78 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -0,0 +1,422 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP +#define KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP + +#include <Kokkos_MinMax.hpp> + +namespace Kokkos { +namespace Impl { + +template <typename... Properties> +class TeamPolicyInternal<HIP, Properties...> + : public PolicyTraits<Properties...> { + public: + using execution_policy = TeamPolicyInternal; + + using traits = PolicyTraits<Properties...>; + + template <typename ExecSpace, typename... OtherProperties> + friend class TeamPolicyInternal; + + private: + typename traits::execution_space m_space; + int m_league_size; + int m_team_size; + int m_vector_length; + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; + + public: + using execution_space = HIP; + + template <class... 
OtherProperties> + TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_vector_length = p.m_vector_length; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; + } + + template <typename FunctorType> + int team_size_max(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>; + + return internal_team_size_common<BlockType::Max, closure_type, void>(f); + } + + template <class FunctorType> + inline int team_size_max(const FunctorType& f, + const ParallelReduceTag&) const { + using functor_analysis_type = + Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + TeamPolicyInternal, FunctorType, void>; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer<FunctorType, + typename functor_analysis_type::Reducer>, + TeamPolicy<Properties...>, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Max, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template <typename FunctorType, typename ReducerType> + inline int team_size_max(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { + using closure_type = + Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>, + TeamPolicy<Properties...>, Kokkos::HIP>; + return internal_team_size_common<BlockType::Max, closure_type, + typename ReducerType::value_type>(f); + } + + template <typename FunctorType> + int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>; + 
+ return internal_team_size_common<BlockType::Preferred, closure_type, void>( + f); + } + + template <typename FunctorType> + inline int team_size_recommended(FunctorType const& f, + ParallelReduceTag const&) const { + using functor_analysis_type = + Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + TeamPolicyInternal, FunctorType, void>; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer<FunctorType, + typename functor_analysis_type::Reducer>, + TeamPolicy<Properties...>, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Preferred, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template <typename FunctorType, typename ReducerType> + int team_size_recommended(FunctorType const& f, ReducerType const&, + ParallelReduceTag const&) const { + using closure_type = + Impl::ParallelReduce<CombinedFunctorReducer<FunctorType, ReducerType>, + TeamPolicy<Properties...>, Kokkos::HIP>; + return internal_team_size_common<BlockType::Preferred, closure_type, + typename ReducerType::value_type>(f); + } + + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } + static int vector_length_max() { return HIPTraits::WarpSize; } + + static int verify_requested_vector_length(int requested_vector_length) { + int test_vector_length = + std::min(requested_vector_length, vector_length_max()); + + // Allow only power-of-two vector_length + if (!(is_integral_power_of_two(test_vector_length))) { + int test_pow2 = 1; + constexpr int warp_size = HIPTraits::WarpSize; + while (test_pow2 < warp_size) { + test_pow2 <<= 1; + if (test_pow2 > test_vector_length) { + break; + } + } + test_vector_length = test_pow2 >> 1; + } + + return test_vector_length; + } + + inline static int scratch_size_max(int level) { + // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team + // reductions. 
They also use one int64_t in static shared memory for a + // shared ID. Furthermore, they use additional scratch memory in some + // reduction scenarios, which depend on the size of the value_type and is + // NOT captured here + constexpr size_t max_possible_team_size = 1024; + constexpr size_t max_reserved_shared_mem_per_team = + (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); + // arbitrarily setting level 1 scratch limit to 20MB, for a + // MI250 that would give us about 4.4GB for 2 teams per CU + constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; + + size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; + return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team + : max_l1_scratch_size); + } + + inline void impl_set_vector_length(size_t size) { m_vector_length = size; } + inline void impl_set_team_size(size_t size) { m_team_size = size; } + int impl_vector_length() const { return m_vector_length; } + + int team_size() const { return m_team_size; } + + int league_size() const { return m_league_size; } + + size_t scratch_size(int level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + size_t team_scratch_size(int level) const { + return m_team_scratch_size[level]; + } + + size_t thread_scratch_size(int level) const { + return m_thread_scratch_size[level]; + } + + typename traits::execution_space space() const { return m_space; } + + TeamPolicyInternal() + : m_space(typename traits::execution_space()), + m_league_size(0), + m_team_size(-1), + m_vector_length(0), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(false), + m_tune_vector_length(false) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, int vector_length_request = 1) + : 
m_space(space_), + m_league_size(league_size_), + m_team_size(team_size_request), + m_vector_length( + (vector_length_request > 0) + ? verify_requested_vector_length(vector_length_request) + : (verify_requested_vector_length(1))), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(bool(team_size_request <= 0)), + m_tune_vector_length(bool(vector_length_request <= 0)) { + // Make sure league size is permissible + const int max_grid_size_x = m_space.hip_device_prop().maxGridSize[0]; + if (league_size_ >= max_grid_size_x) + Impl::throw_runtime_exception( + "Requested too large league_size for TeamPolicy on HIP execution " + "space."); + + // Make sure total block size is permissible + if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { + Impl::throw_runtime_exception( + std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " + "Team size x vector length must be smaller than 1024.")); + } + } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + // FLAG + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) + + {} + + TeamPolicyInternal(int league_size_, int team_size_request, + int vector_length_request = 1) + : 
TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} + + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + -1) {} + + int chunk_size() const { return m_chunk_size; } + + TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerTeamValue const& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerThreadValue const& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, + PerThreadValue const& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + using member_type 
= Kokkos::Impl::HIPTeamMember; + + protected: + template <BlockType BlockSize, class ClosureType, class ValueType, + class FunctorType> + int internal_team_size_common(FunctorType const& f) const { + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); + using Tag = typename PatternTagFromImplSpecialization<ClosureType>::type; + if constexpr (std::is_same_v<Tag, ParallelReduceTag>) { + using Interface = + typename Impl::DeduceFunctorPatternInterface<ClosureType>::type; + using Analysis = + Impl::FunctorAnalysis<Interface, typename ClosureType::Policy, + FunctorType, ValueType>; + shmem_thread += + ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); + } + const int vector_length = impl_vector_length(); + + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + if constexpr (BlockSize == BlockType::Max) { + block_size = hip_get_max_team_blocksize<ClosureType, + typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } else { + block_size = + hip_get_preferred_team_blocksize<ClosureType, + typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " + "team size.")); + } + if constexpr (std::is_same_v<Tag, ParallelForTag>) { + return block_size / impl_vector_length(); + } else { + // Currently we require Power-of-2 team size for reductions. 
+ int p2 = 1; + while (p2 <= block_size) p2 *= 2; + p2 /= 2; + return p2 / impl_vector_length(); + } + } +}; + +__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, + int32_t* scratch_locks, + size_t num_scratch_locks) { + int64_t threadid = 0; + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + int64_t const wraparound_len = + Kokkos::min(int64_t(league_size), + int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (int64_t(threadid + blockDim.x * blockDim.y) >= + wraparound_len * blockDim.x * blockDim.y) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + return threadid; +} + +__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, + int64_t threadid) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + scratch_locks[threadid] = 0; + } +} + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp index 313e5f5217296f152171e4bffe418b60c4a413c3..3d70b596463586f3c30c409d06893d58ac1003e3 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -19,7 +19,6 @@ #include <HIP/Kokkos_HIP_Space.hpp> #include <Kokkos_UniqueToken.hpp> -#include <impl/Kokkos_SharedAlloc.hpp> namespace Kokkos { diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp index 30774c898b679e463fe5aff997186f3ffc6f9bc8..f5b1d321e8cfe81424ec0cc82d96b66f6c1c1860 100644 --- 
a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp @@ -40,8 +40,8 @@ struct in_place_shfl_op { template <class Scalar> // requires _assignable_from_bits<Scalar> __device__ inline std::enable_if_t<sizeof(Scalar) < sizeof(int)> operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { using shfl_type = int; union conv_type { Scalar orig; @@ -65,16 +65,16 @@ struct in_place_shfl_op { template <class Scalar> // requires _assignable_from_bits<Scalar> __device__ inline std::enable_if_t<sizeof(Scalar) == sizeof(int)> operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast<int&>(out) = self().do_shfl_op( reinterpret_cast<int const&>(in), lane_or_delta, width); } template <class Scalar> __device__ inline std::enable_if_t<sizeof(Scalar) == sizeof(double)> - operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast<double&>(out) = self().do_shfl_op( *reinterpret_cast<double const*>(&in), lane_or_delta, width); } @@ -82,8 +82,8 @@ struct in_place_shfl_op { // sizeof(Scalar) > sizeof(double) case template <typename Scalar> __device__ inline std::enable_if_t<(sizeof(Scalar) > sizeof(double))> - operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, const Scalar& val, int lane_or_delta, + int width) const noexcept { using shuffle_as_t = int; constexpr int N = sizeof(Scalar) / sizeof(shuffle_as_t); @@ -108,7 +108,7 @@ struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> { template <class... 
Args> __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> { @@ -123,7 +123,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> { template <class... Args> __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> { @@ -138,7 +138,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> { template <class... Args> __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp similarity index 53% rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp rename to packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp index acd2a572c8c216fe1f953e2651533ecc08d99ddb..34d5ecf1a657136a39a5157f6ccfb04e91a71ec1 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp @@ -14,28 +14,23 @@ // //@HEADER -#ifndef KOKKOS_STD_ALGORITHMS_SWAP_HPP -#define KOKKOS_STD_ALGORITHMS_SWAP_HPP +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#endif -#include <Kokkos_Core.hpp> +#include <HIP/Kokkos_HIP_ZeroMemset.hpp> +#include <HIP/Kokkos_HIP_ParallelFor_Range.hpp> namespace Kokkos { -namespace Experimental { - -// swap -template <class T> -KOKKOS_INLINE_FUNCTION void swap(T& a, T& b) noexcept { - static_assert( - std::is_move_assignable<T>::value && std::is_move_constructible<T>::value, - 
"Kokkos::Experimental::swap arguments must be move assignable " - "and move constructible"); +namespace Impl { - T tmp = std::move(a); - a = std::move(b); - b = std::move(tmp); +// alternative to hipMemsetAsync, which sets the first `cnt` bytes of `dst` to 0 +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt) { + Kokkos::parallel_for( + "Kokkos::ZeroMemset via parallel_for", + Kokkos::RangePolicy<Kokkos::HIP>(exec_space, 0, cnt), + KOKKOS_LAMBDA(size_t i) { static_cast<char*>(dst)[i] = 0; }); } -} // namespace Experimental +} // namespace Impl } // namespace Kokkos - -#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 5c40d0fbc8d079d7d64db4f0799624d8ebdc4b4d..18708cf8c5669d7e177931068076186929d963d7 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -23,20 +23,21 @@ namespace Kokkos { namespace Impl { -template <class T, class... P> -struct ZeroMemset<HIP, View<T, P...>> { - ZeroMemset(const HIP& exec_space, const View<T, P...>& dst, - typename View<T, P...>::const_value_type&) { - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( - dst.data(), 0, dst.size() * sizeof(typename View<T, P...>::value_type), - exec_space.hip_stream())); - } +// hipMemsetAsync sets the first `cnt` bytes of `dst` to the provided value +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt); - ZeroMemset(const View<T, P...>& dst, - typename View<T, P...>::const_value_type&) { +template <> +struct ZeroMemset<HIP> { + ZeroMemset(const HIP& exec_space, void* dst, size_t cnt) { + // in ROCm <= 6.2.0, hipMemsetAsync on a host-allocated pointer + // returns an invalid value error, but accessing the data via a + // GPU kernel works. 
+#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + zero_with_hip_kernel(exec_space, dst, cnt); +#else KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemset(dst.data(), 0, - dst.size() * sizeof(typename View<T, P...>::value_type))); + hipMemsetAsync(dst, 0, cnt, exec_space.hip_stream())); +#endif } }; diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp index 4a40ffcaa4f680c86e7bfd6d832cf7ea5dae9b2a..1f3d0783449fc1585b247d132ac438ea7fe2a139 100644 --- a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -103,6 +103,7 @@ void HPX::print_configuration(std::ostream &os, const bool) const { os << hpx::configuration_string() << '\n'; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 bool &HPX::impl_get_in_parallel() noexcept { static thread_local bool in_parallel = false; return in_parallel; @@ -127,6 +128,7 @@ HPX::impl_not_in_parallel_scope::~impl_not_in_parallel_scope() noexcept { KOKKOS_EXPECTS(!impl_get_in_parallel()); impl_get_in_parallel() = true; } +#endif void HPX::impl_decrement_active_parallel_region_count() { std::unique_lock<hpx::spinlock> l(m_active_parallel_region_count_mutex); @@ -151,7 +153,7 @@ void HPX::impl_instance_fence_locked(const std::string &name) const { auto &s = impl_get_sender(); hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } @@ -182,7 +184,7 @@ void HPX::impl_static_fence(const std::string &name) { } hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender( + s = hpx::execution::experimental::unique_any_sender<>( hpx::execution::experimental::just()); }); } diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp index 1dfc5b406464192f90563996706b31522a569722..7d49933790878356f400acc0913b31cdecd46e50 100644 --- 
a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -27,25 +27,15 @@ static_assert(false, #include <Kokkos_Core_fwd.hpp> -#include <Kokkos_HostSpace.hpp> -#include <cstddef> -#include <iosfwd> - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include <Kokkos_HBWSpace.hpp> -#endif - #include <Kokkos_HostSpace.hpp> #include <Kokkos_Layout.hpp> #include <Kokkos_MemoryTraits.hpp> #include <Kokkos_Parallel.hpp> #include <Kokkos_ScratchSpace.hpp> -#include <Kokkos_TaskScheduler.hpp> #include <impl/Kokkos_ConcurrentBitset.hpp> #include <impl/Kokkos_FunctorAnalysis.hpp> #include <impl/Kokkos_HostSharedPtr.hpp> #include <impl/Kokkos_Tools.hpp> -#include <impl/Kokkos_TaskQueue.hpp> #include <impl/Kokkos_InitializationSettings.hpp> #include <KokkosExp_MDRangePolicy.hpp> @@ -59,6 +49,7 @@ static_assert(false, #include <Kokkos_UniqueToken.hpp> +#include <cstddef> #include <iosfwd> #include <functional> #include <memory> @@ -82,12 +73,12 @@ class hpx_thread_buffer { } public: - hpx_thread_buffer() = default; - ~hpx_thread_buffer() = default; - hpx_thread_buffer(const hpx_thread_buffer &) = delete; - hpx_thread_buffer(hpx_thread_buffer &&) = delete; + hpx_thread_buffer() = default; + ~hpx_thread_buffer() = default; + hpx_thread_buffer(const hpx_thread_buffer &) = delete; + hpx_thread_buffer(hpx_thread_buffer &&) = delete; hpx_thread_buffer &operator=(const hpx_thread_buffer &) = delete; - hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; + hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; void resize(const std::size_t num_threads, const std::size_t size_per_thread, const std::size_t extra_space = 0) noexcept; @@ -147,10 +138,10 @@ class HPX { hpx::execution::experimental::unique_any_sender<> &&sender) : m_instance_id(instance_id), m_sender{std::move(sender)} {} - instance_data(const instance_data &) = delete; - instance_data(instance_data &&) = delete; + instance_data(const instance_data &) = delete; + instance_data(instance_data 
&&) = delete; instance_data &operator=(const instance_data &) = delete; - instance_data &operator=(instance_data) = delete; + instance_data &operator=(instance_data) = delete; uint32_t m_instance_id{HPX::impl_default_instance_id()}; hpx::execution::experimental::unique_any_sender<> m_sender{ @@ -175,21 +166,35 @@ class HPX { : m_instance_data(Kokkos::Impl::HostSharedPtr<instance_data>( &m_default_instance_data, &default_instance_deleter)) {} ~HPX() = default; - HPX(instance_mode mode) + explicit HPX(instance_mode mode) : m_instance_data( mode == instance_mode::independent ? (Kokkos::Impl::HostSharedPtr<instance_data>( new instance_data(m_next_instance_id++))) : Kokkos::Impl::HostSharedPtr<instance_data>( &m_default_instance_data, &default_instance_deleter)) {} - HPX(hpx::execution::experimental::unique_any_sender<> &&sender) + explicit HPX(hpx::execution::experimental::unique_any_sender<> &&sender) : m_instance_data(Kokkos::Impl::HostSharedPtr<instance_data>( new instance_data(m_next_instance_id++, std::move(sender)))) {} +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template <typename T = void> + KOKKOS_DEPRECATED_WITH_COMMENT( + "HPX execution space should be constructed explicitly.") + HPX(instance_mode mode) + : HPX(mode) {} + + template <typename T = void> + KOKKOS_DEPRECATED_WITH_COMMENT( + "HPX execution space should be constructed explicitly.") + HPX(hpx::execution::experimental::unique_any_sender<> &&sender) + : HPX(std::move(sender)) {} +#endif + HPX(HPX &&other) = default; HPX(const HPX &other) = default; - HPX &operator=(HPX &&) = default; + HPX &operator=(HPX &&) = default; HPX &operator=(const HPX &) = default; void print_configuration(std::ostream &os, bool /*verbose*/ = false) const; @@ -201,14 +206,15 @@ class HPX { return impl_get_instance_data().m_instance_id; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static bool &impl_get_in_parallel() noexcept; struct impl_in_parallel_scope { impl_in_parallel_scope() noexcept; ~impl_in_parallel_scope() noexcept; - 
impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; - impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; - impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; + impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; impl_in_parallel_scope &operator=(impl_in_parallel_scope const &) = delete; }; @@ -223,9 +229,10 @@ class HPX { delete; }; - static bool in_parallel(HPX const & = HPX()) noexcept { + KOKKOS_DEPRECATED static bool in_parallel(HPX const & = HPX()) noexcept { return impl_get_in_parallel(); } +#endif static void impl_decrement_active_parallel_region_count(); static void impl_increment_active_parallel_region_count(); @@ -240,24 +247,14 @@ class HPX { impl_instance_fence(name); } - static bool is_asynchronous(HPX const & = HPX()) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool is_asynchronous(HPX const & = HPX()) noexcept { #if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) return true; #else return false; #endif } - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - template <typename F> - KOKKOS_DEPRECATED static void partition_master( - F const &, int requested_num_partitions = 0, int = 0) { - if (requested_num_partitions > 1) { - Kokkos::abort( - "Kokkos::Experimental::HPX::partition_master: can't partition an " - "HPX instance\n"); - } - } #endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -284,8 +281,8 @@ class HPX { return impl_get_instance_data().m_buffer; } - hpx::execution::experimental::unique_any_sender<> &impl_get_sender() const - noexcept { + hpx::execution::experimental::unique_any_sender<> &impl_get_sender() + const noexcept { return impl_get_instance_data().m_sender; } @@ -355,7 +352,9 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_plain_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, n, stacksize); @@ -417,15 +416,21 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_setup_finalize_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.setup(); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.finalize(); }}, n, stacksize); @@ -442,6 +447,20 @@ class HPX { } }; +template <typename... Args> +std::vector<HPX> partition_space(HPX const &, Args... args) { + std::vector<HPX> instances(sizeof...(args)); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + +template <typename T> +std::vector<HPX> partition_space(HPX const &, std::vector<T> const &weights) { + std::vector<HPX> instances(weights.size()); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + extern template void HPX::impl_bulk_plain_erased<int>( bool, bool, std::function<void(int)> &&, int const, hpx::threads::thread_stacksize stacksize) const; @@ -1292,6 +1311,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1299,6 +1319,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast<pointer_type>( @@ 
-1320,6 +1343,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1327,6 +1351,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast<pointer_type>( @@ -1407,6 +1434,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1414,6 +1442,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast<pointer_type>( @@ -1435,6 +1466,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1442,6 +1474,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast<pointer_type>( @@ -1751,11 +1786,24 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const 
Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HPXTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } /** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each @@ -1789,14 +1837,26 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember> &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HPXTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + wrapped_reducer.final(&value); + result = value; } template <typename iType, class Lambda, typename ReducerType, @@ -1805,11 +1865,24 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> &loop_boundaries, const Lambda &lambda, 
const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HPXTeamMember::execution_space>, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template <typename iType, class Lambda, typename ReducerType, @@ -1818,14 +1891,27 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember> &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::HPXTeamMember::execution_space>, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template <typename iType, class FunctorType, typename ValueType> @@ -1974,7 +2060,9 @@ KOKKOS_INLINE_FUNCTION void single( } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include <HPX/Kokkos_HPX_Task.hpp> +#endif #endif /* #if defined( KOKKOS_ENABLE_HPX ) 
*/ #endif /* #ifndef KOKKOS_HPX_HPP */ diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp index 28c75b2515ae45ea4239439ba713468ab5ac2d2b..d775b7fac3b70c0b75ff0c0c9199aefff38c96f8 100644 --- a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -25,6 +25,8 @@ #include <HPX/Kokkos_HPX.hpp> +#include <impl/Kokkos_TaskTeamMember.hpp> + #include <hpx/execution.hpp> #include <hpx/future.hpp> @@ -33,6 +35,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -256,6 +263,10 @@ extern template class TaskQueue< } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index c9080db01cafc799d9da5c36516fddb7f52e861f..92dc506c5e9ddd80851f684872f3ec137b7eab58 100644 --- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -30,6 +30,7 @@ static_assert(false, #include <impl/KokkosExp_Host_IterateTile.hpp> #include <Kokkos_ExecPolicy.hpp> #include <type_traits> +#include <cmath> namespace Kokkos { @@ -60,28 +61,33 @@ namespace Impl { // NOTE the comparison below is encapsulated to silent warnings about pointless // comparison of unsigned integer with zero template <class T> -constexpr std::enable_if_t<!std::is_signed<T>::value, bool> +constexpr 
std::enable_if_t<!std::is_signed_v<T>, bool> is_less_than_value_initialized_variable(T) { return false; } template <class T> -constexpr std::enable_if_t<std::is_signed<T>::value, bool> +constexpr std::enable_if_t<std::is_signed_v<T>, bool> is_less_than_value_initialized_variable(T arg) { return arg < T{}; } // Checked narrowing conversion that calls abort if the cast changes the value template <class To, class From> -constexpr To checked_narrow_cast(From arg) { +constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = - (std::is_signed<To>::value != std::is_signed<From>::value); + (std::is_signed_v<To> != std::is_signed_v<From>); auto const ret = static_cast<To>(arg); if (static_cast<From>(ret) != arg || (is_different_signedness && is_less_than_value_initialized_variable(arg) != is_less_than_value_initialized_variable(ret))) { - Kokkos::abort("unsafe narrowing conversion"); + auto msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(arg) + ") in dimension (" + std::to_string(idx) + + "), which may not preserve its original value.\n"; + Kokkos::abort(msg.c_str()); } return ret; } @@ -96,15 +102,15 @@ constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { using T = typename Array::value_type; Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); auto* ptr = a.data(); // NOTE equivalent to // std::transform(std::begin(init), std::end(init), a.data(), // [](U x) { return static_cast<T>(x); }); // except that std::transform is not constexpr. 
- for (auto x : init) { - *ptr++ = checked_narrow_cast<T>(x); - (void)checked_narrow_cast<IndexType>(x); // see note above + for (std::size_t i = 0; i < M; ++i) { + *ptr++ = checked_narrow_cast<T>(init[i], i); + (void)checked_narrow_cast<IndexType>(init[i], i); // see note above } return a; } @@ -120,10 +126,10 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; NVCC_WONT_LET_ME_CALL_YOU_Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); for (std::size_t i = 0; i < M; ++i) { - a[i] = checked_narrow_cast<T>(other[i]); - (void)checked_narrow_cast<IndexType>(other[i]); // see note above + a[i] = checked_narrow_cast<T>(other[i], i); + (void)checked_narrow_cast<IndexType>(other[i], i); // see note above } return a; } @@ -150,9 +156,20 @@ TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { // multi-dimensional iteration pattern template <typename... Properties> -struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { - using traits = Kokkos::Impl::PolicyTraits<Properties...>; - using range_policy = RangePolicy<Properties...>; +struct MDRangePolicy; + +// Note: If MDRangePolicy has a primary template, implicit CTAD (deduction +// guides) are generated -> MDRangePolicy<> by some compilers, which is +// incorrect. By making it a template specialization instead, no implicit CTAD +// is generated. This works because there has to be at least one property +// specified (which is Rank<...>); otherwise, we'd get the static_assert +// "Kokkos::Error: MD iteration pattern not defined". This template +// specialization uses <P, Properties...> in all places for correctness. +template <typename P, typename... 
Properties> +struct MDRangePolicy<P, Properties...> + : public Kokkos::Impl::PolicyTraits<P, Properties...> { + using traits = Kokkos::Impl::PolicyTraits<P, Properties...>; + using range_policy = RangePolicy<P, Properties...>; typename traits::execution_space m_space; @@ -161,13 +178,13 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { typename traits::schedule_type, typename traits::index_type>; using execution_policy = - MDRangePolicy<Properties...>; // needed for is_execution_space - // interrogation + MDRangePolicy<P, Properties...>; // needed for is_execution_policy + // interrogation template <class... OtherProperties> friend struct MDRangePolicy; - static_assert(!std::is_void<typename traits::iteration_pattern>::value, + static_assert(!std::is_void_v<typename traits::iteration_pattern>, "Kokkos Error: MD iteration pattern not defined"); using iteration_pattern = typename traits::iteration_pattern; @@ -222,9 +239,9 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { template <typename LT, std::size_t LN, typename UT, std::size_t UN, typename TT = array_index_type, std::size_t TN = rank, - typename = std::enable_if_t<std::is_integral<LT>::value && - std::is_integral<UT>::value && - std::is_integral<TT>::value>> + typename = std::enable_if_t<std::is_integral_v<LT> && + std::is_integral_v<UT> && + std::is_integral_v<TT>>> MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) : MDRangePolicy( @@ -241,9 +258,9 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { template <typename LT, std::size_t LN, typename UT, std::size_t UN, typename TT = array_index_type, std::size_t TN = rank, - typename = std::enable_if_t<std::is_integral<LT>::value && - std::is_integral<UT>::value && - std::is_integral<TT>::value>> + typename = std::enable_if_t<std::is_integral_v<LT> && + std::is_integral_v<UT> && + std::is_integral_v<TT>>> MDRangePolicy(const typename 
traits::execution_space& work_space, const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) @@ -275,14 +292,14 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { } template <typename T, std::size_t NT = rank, - typename = std::enable_if_t<std::is_integral<T>::value>> + typename = std::enable_if_t<std::is_integral_v<T>>> MDRangePolicy(Kokkos::Array<T, rank> const& lower, Kokkos::Array<T, rank> const& upper, Kokkos::Array<T, NT> const& tile = Kokkos::Array<T, NT>{}) : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} template <typename T, std::size_t NT = rank, - typename = std::enable_if_t<std::is_integral<T>::value>> + typename = std::enable_if_t<std::is_integral_v<T>>> MDRangePolicy(const typename traits::execution_space& work_space, Kokkos::Array<T, rank> const& lower, Kokkos::Array<T, rank> const& upper, @@ -314,7 +331,44 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { } bool impl_tune_tile_size() const { return m_tune_tile_size; } + tile_type tile_size_recommended() const { + tile_type rec_tile_sizes = {}; + + for (std::size_t i = 0; i < rec_tile_sizes.size(); ++i) { + rec_tile_sizes[i] = tile_size_recommended(i); + } + return rec_tile_sizes; + } + + int max_total_tile_size() const { + return Impl::get_tile_size_properties(m_space).max_total_tile_size; + } + private: + int tile_size_recommended(const int tile_rank) const { + auto properties = Impl::get_tile_size_properties(m_space); + int last_rank = (inner_direction == Iterate::Right) ? rank - 1 : 0; + int rank_acc = + (inner_direction == Iterate::Right) ? tile_rank + 1 : tile_rank - 1; + int rec_tile_size = (std::pow(properties.default_tile_size, rank_acc) < + properties.max_total_tile_size) + ? 
properties.default_tile_size + : 1; + + if (tile_rank == last_rank) { + rec_tile_size = tile_size_last_rank( + properties, m_upper[last_rank] - m_lower[last_rank]); + } + return rec_tile_size; + } + + int tile_size_last_rank(const Impl::TileSizeProperties properties, + const index_type length) const { + return properties.default_largest_tile_size == 0 + ? std::max<int>(length, 1) + : properties.default_largest_tile_size; + } + void init_helper(Impl::TileSizeProperties properties) { m_prod_tile_dims = 1; int increment = 1; @@ -325,8 +379,23 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { rank_start = rank - 1; rank_end = -1; } + for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; + + if (m_upper[i] < m_lower[i]) { + std::string msg = + "Kokkos::MDRangePolicy bounds error: The lower bound (" + + std::to_string(m_lower[i]) + ") is greater than its upper bound (" + + std::to_string(m_upper[i]) + ") in dimension " + std::to_string(i) + + ".\n"; +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + Kokkos::abort(msg.c_str()); +#elif defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + Kokkos::Impl::log_warning(msg); +#endif + } + if (m_tile[i] <= 0) { m_tune_tile_size = true; if ((inner_direction == Iterate::Right && (i < rank - 1)) || @@ -338,9 +407,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { m_tile[i] = 1; } } else { - m_tile[i] = properties.default_largest_tile_size == 0 - ? 
std::max<int>(length, 1) - : properties.default_largest_tile_size; + m_tile[i] = tile_size_last_rank(properties, length); } } m_tile_end[i] = @@ -358,6 +425,57 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { } }; +template <typename LT, size_t N, typename UT> +MDRangePolicy(const LT (&)[N], const UT (&)[N]) -> MDRangePolicy<Rank<N>>; + +template <typename LT, size_t N, typename UT, typename TT, size_t TN> +MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + -> MDRangePolicy<Rank<N>>; + +template <typename LT, size_t N, typename UT> +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) + -> MDRangePolicy<Rank<N>>; + +template <typename LT, size_t N, typename UT, typename TT, size_t TN> +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], + const TT (&)[TN]) -> MDRangePolicy<Rank<N>>; + +template <typename ES, typename LT, size_t N, typename UT, + typename = std::enable_if_t<is_execution_space_v<ES>>> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) + -> MDRangePolicy<ES, Rank<N>>; + +template <typename ES, typename LT, size_t N, typename UT, typename TT, + size_t TN, typename = std::enable_if_t<is_execution_space_v<ES>>> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + -> MDRangePolicy<ES, Rank<N>>; + +template <typename T, size_t N> +MDRangePolicy(Array<T, N> const&, Array<T, N> const&) -> MDRangePolicy<Rank<N>>; + +template <typename T, size_t N, size_t NT> +MDRangePolicy(Array<T, N> const&, Array<T, N> const&, Array<T, NT> const&) + -> MDRangePolicy<Rank<N>>; + +template <typename T, size_t N> +MDRangePolicy(DefaultExecutionSpace const&, Array<T, N> const&, + Array<T, N> const&) -> MDRangePolicy<Rank<N>>; + +template <typename T, size_t N, size_t NT> +MDRangePolicy(DefaultExecutionSpace const&, Array<T, N> const&, + Array<T, N> const&, Array<T, NT> const&) + -> MDRangePolicy<Rank<N>>; + +template <typename ES, typename T, 
size_t N, + typename = std::enable_if_t<is_execution_space_v<ES>>> +MDRangePolicy(ES const&, Array<T, N> const&, Array<T, N> const&) + -> MDRangePolicy<ES, Rank<N>>; + +template <typename ES, typename T, size_t N, size_t NT, + typename = std::enable_if_t<is_execution_space_v<ES>>> +MDRangePolicy(ES const&, Array<T, N> const&, Array<T, N> const&, + Array<T, NT> const&) -> MDRangePolicy<ES, Rank<N>>; + } // namespace Kokkos #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 9f5deed5d66f974fefee183748f36289331c06e7..62f527aa025c10c6cc66624e8ee824a35039be4a 100644 --- a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -41,10 +41,10 @@ class AnonymousSpace { using device_type = Kokkos::Device<execution_space, memory_space>; /**\brief Default memory space instance */ - AnonymousSpace() = default; - AnonymousSpace(AnonymousSpace &&rhs) = default; - AnonymousSpace(const AnonymousSpace &rhs) = default; - AnonymousSpace &operator=(AnonymousSpace &&) = default; + AnonymousSpace() = default; + AnonymousSpace(AnonymousSpace &&rhs) = default; + AnonymousSpace(const AnonymousSpace &rhs) = default; + AnonymousSpace &operator=(AnonymousSpace &&) = default; AnonymousSpace &operator=(const AnonymousSpace &) = default; ~AnonymousSpace() = default; diff --git a/packages/kokkos/core/src/Kokkos_Array.hpp b/packages/kokkos/core/src/Kokkos_Array.hpp index 82ceaaec21833dc0a320aa4e7e1c3d64261008a2..493536b53bed8889d44f6f0feb0553fdf0e4744d 100644 --- a/packages/kokkos/core/src/Kokkos_Array.hpp +++ b/packages/kokkos/core/src/Kokkos_Array.hpp @@ -22,20 +22,20 @@ #endif #include <Kokkos_Macros.hpp> +#include <Kokkos_Swap.hpp> #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_StringManipulation.hpp> #include <type_traits> #include <algorithm> #include <utility> -#include <limits> #include <cstddef> namespace Kokkos 
{ #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK namespace Impl { -template <typename Integral, bool Signed = std::is_signed<Integral>::value> +template <typename Integral, bool Signed = std::is_signed_v<Integral>> struct ArrayBoundsCheck; template <typename Integral> @@ -79,7 +79,11 @@ struct ArrayBoundsCheck<Integral, false> { /**\brief Derived from the C++17 'std::array'. * Dropping the iterator interface. */ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template <class T = void, size_t N = KOKKOS_INVALID_INDEX, class Proxy = void> +#else +template <class T, size_t N> +#endif struct Array { public: /** @@ -128,10 +132,38 @@ struct Array { KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { return &m_internal_implementation_private_member_data[0]; } + + friend KOKKOS_FUNCTION constexpr bool operator==(Array const& lhs, + Array const& rhs) noexcept { + for (size_t i = 0; i != N; ++i) + if (lhs[i] != rhs[i]) return false; + return true; + } + + friend KOKKOS_FUNCTION constexpr bool operator!=(Array const& lhs, + Array const& rhs) noexcept { + return !(lhs == rhs); + } + + private: + template <class U = T> + friend KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< + Impl::is_swappable<U>::value> + kokkos_swap(Array<T, N>& a, + Array<T, N>& b) noexcept(Impl::is_nothrow_swappable_v<U>) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } + } }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 template <class T, class Proxy> struct Array<T, 0, Proxy> { +#else +template <class T> +struct Array<T, 0> { +#endif public: using reference = T&; using const_reference = std::add_const_t<T>&; @@ -163,28 +195,40 @@ struct Array<T, 0, Proxy> { return *reinterpret_cast<const_pointer>(-1); } - KOKKOS_INLINE_FUNCTION pointer data() { return nullptr; } - KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr pointer data() { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { + return nullptr; + } 
- KOKKOS_DEFAULTED_FUNCTION ~Array() = default; - KOKKOS_DEFAULTED_FUNCTION Array() = default; - KOKKOS_DEFAULTED_FUNCTION Array(const Array&) = default; - KOKKOS_DEFAULTED_FUNCTION Array& operator=(const Array&) = default; + friend KOKKOS_FUNCTION constexpr bool operator==(Array const&, + Array const&) noexcept { + return true; + } + friend KOKKOS_FUNCTION constexpr bool operator!=(Array const&, + Array const&) noexcept { + return false; + } - // Some supported compilers are not sufficiently C++11 compliant - // for default move constructor and move assignment operator. - // Array( Array && ) = default ; - // Array & operator = ( Array && ) = default ; + private: + friend KOKKOS_INLINE_FUNCTION constexpr void kokkos_swap( + Array<T, 0>&, Array<T, 0>&) noexcept {} }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +struct KokkosArrayContiguous {}; +struct KokkosArrayStrided {}; +} // namespace Impl + template <> -struct Array<void, KOKKOS_INVALID_INDEX, void> { - struct contiguous {}; - struct strided {}; +struct KOKKOS_DEPRECATED Array<void, KOKKOS_INVALID_INDEX, void> { + using contiguous = Impl::KokkosArrayContiguous; + using strided = Impl::KokkosArrayStrided; }; template <class T> -struct Array<T, KOKKOS_INVALID_INDEX, Array<>::contiguous> { +struct KOKKOS_DEPRECATED + Array<T, KOKKOS_INVALID_INDEX, Impl::KokkosArrayContiguous> { private: T* m_elem; size_t m_size; @@ -252,7 +296,8 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::contiguous> { }; template <class T> -struct Array<T, KOKKOS_INVALID_INDEX, Array<>::strided> { +struct KOKKOS_DEPRECATED + Array<T, KOKKOS_INVALID_INDEX, Impl::KokkosArrayStrided> { private: T* m_elem; size_t m_size; @@ -319,6 +364,36 @@ struct Array<T, KOKKOS_INVALID_INDEX, Array<>::strided> { size_type arg_stride) : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +#endif + +template <typename T, typename... Us> +Array(T, Us...) 
-> Array<T, 1 + sizeof...(Us)>; + +namespace Impl { + +template <typename T, size_t N, size_t... I> +KOKKOS_FUNCTION constexpr Array<std::remove_cv_t<T>, N> to_array_impl( + T (&a)[N], std::index_sequence<I...>) { + return {{a[I]...}}; +} + +template <typename T, size_t N, size_t... I> +KOKKOS_FUNCTION constexpr Array<std::remove_cv_t<T>, N> to_array_impl( + T (&&a)[N], std::index_sequence<I...>) { + return {{std::move(a[I])...}}; +} + +} // namespace Impl + +template <typename T, size_t N> +KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { + return Impl::to_array_impl(a, std::make_index_sequence<N>{}); +} + +template <typename T, size_t N> +KOKKOS_FUNCTION constexpr auto to_array(T (&&a)[N]) { + return Impl::to_array_impl(std::move(a), std::make_index_sequence<N>{}); +} } // namespace Kokkos @@ -329,6 +404,7 @@ struct std::tuple_size<Kokkos::Array<T, N>> template <std::size_t I, class T, std::size_t N> struct std::tuple_element<I, Kokkos::Array<T, N>> { + static_assert(I < N); using type = T; }; @@ -336,27 +412,57 @@ namespace Kokkos { template <std::size_t I, class T, std::size_t N> KOKKOS_FUNCTION constexpr T& get(Array<T, N>& a) noexcept { + static_assert(I < N); return a[I]; } template <std::size_t I, class T, std::size_t N> KOKKOS_FUNCTION constexpr T const& get(Array<T, N> const& a) noexcept { + static_assert(I < N); return a[I]; } template <std::size_t I, class T, std::size_t N> KOKKOS_FUNCTION constexpr T&& get(Array<T, N>&& a) noexcept { + static_assert(I < N); return std::move(a[I]); } template <std::size_t I, class T, std::size_t N> KOKKOS_FUNCTION constexpr T const&& get(Array<T, N> const&& a) noexcept { + static_assert(I < N); return std::move(a[I]); } } // namespace Kokkos //</editor-fold> +//<editor-fold desc="Support for range-based for loop"> +namespace Kokkos { + +template <class T, std::size_t N> +KOKKOS_FUNCTION constexpr T const* begin(Array<T, N> const& a) noexcept { + return a.data(); +} + +template <class T, std::size_t N> 
+KOKKOS_FUNCTION constexpr T* begin(Array<T, N>& a) noexcept { + return a.data(); +} + +template <class T, std::size_t N> +KOKKOS_FUNCTION constexpr T const* end(Array<T, N> const& a) noexcept { + return a.data() + a.size(); +} + +template <class T, std::size_t N> +KOKKOS_FUNCTION constexpr T* end(Array<T, N>& a) noexcept { + return a.data() + a.size(); +} + +} // namespace Kokkos +//</editor-fold> + #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY diff --git a/packages/kokkos/core/src/Kokkos_Assert.hpp b/packages/kokkos/core/src/Kokkos_Assert.hpp index c3b9004734a16f4cea233b0497df108e43e213e8..6fea286005e0ffa889796a3c6b706523db1e5397 100644 --- a/packages/kokkos/core/src/Kokkos_Assert.hpp +++ b/packages/kokkos/core/src/Kokkos_Assert.hpp @@ -44,9 +44,6 @@ __LINE__) " \n"); \ } \ } -// some projects already define this for themselves, so don't mess -// them up -#ifndef KOKKOS_ASSERT #define KOKKOS_ASSERT(...) \ { \ if (!bool(__VA_ARGS__)) { \ @@ -58,8 +55,7 @@ __LINE__) " \n"); \ } \ } -#endif // ifndef KOKKOS_ASSERT -#else // not debug mode +#else // not debug mode #define KOKKOS_EXPECTS(...) #define KOKKOS_ENSURES(...) 
#ifndef KOKKOS_ASSERT diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp index 6fc903f2743454174813317eb2160a9163398d8b..ba6113609229a4161d3658b7047402273596c3f5 100644 --- a/packages/kokkos/core/src/Kokkos_Atomic.hpp +++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp @@ -47,7 +47,6 @@ #include <Kokkos_Macros.hpp> #include <Kokkos_Atomics_Desul_Wrapper.hpp> -#include <Kokkos_Atomics_Desul_Volatile_Wrapper.hpp> #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp deleted file mode 100644 index 1c4347463219d95f07cc68deccf46890e912e7f1..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ /dev/null @@ -1,197 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include <Kokkos_Macros.hpp> -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#include <Kokkos_Macros.hpp> -#include <Kokkos_Atomics_Desul_Config.hpp> -#include <desul/atomics.hpp> - -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() -#else -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() -#endif - -// clang-format off -namespace Kokkos { - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast<T*>(dest), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_store(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// atomic_fetch_op -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_add (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_sub (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_max (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (volatile T* const dest, 
desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_min (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mul (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_div (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mod (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_and (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_or (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_xor (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_nand(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), 
KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_lshift(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_rshift(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op_fetch -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), 
KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mod_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_and_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_or_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_xor_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_nand_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const 
T> val) { return desul::atomic_lshift_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_rshift_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min 
(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_and (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_or (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// Exchange - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_exchange(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), 
KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) { - return desul::atomic_compare_exchange_strong(const_cast<T*>(dest),expected, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) { - return desul::atomic_compare_exchange(const_cast<T*>(dest),compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -} -#undef KOKKOS_DESUL_MEM_SCOPE - -// clang-format on -#endif diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index bda37839805c75e0ee58f4f926f52b7083d339a4..40f51c5a33406d103036fe85fb04190d3ba95e78 100644 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -22,18 +22,18 @@ static_assert(false, #ifndef KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_ #define KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_ #include <Kokkos_Macros.hpp> - -#include <Kokkos_Atomics_Desul_Config.hpp> #include <desul/atomics.hpp> +#include <impl/Kokkos_Utilities.hpp> // identity_type #include <impl/Kokkos_Volatile_Load.hpp> -// clang-format off namespace Kokkos { -// FIXME: These functions don't have any use/test in unit tests ... 
-// ========================================================== -inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline const char* atomic_query_version() { + return "KOKKOS_DESUL_ATOMICS"; +} +#endif #if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ !defined(__CUDA_ARCH__) @@ -49,203 +49,126 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #endif // ============================================================ -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() #endif -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { atomic_store(dest,val); } +namespace Impl { +template <class T> +using not_deduced_atomic_t = + std::add_const_t<std::remove_volatile_t<type_identity_t<T>>>; + +template <class T, class R> +using enable_if_atomic_t = + std::enable_if_t<!std::is_reference_v<T> && !std::is_const_v<T>, + std::remove_volatile_t<R>>; +} // namespace Impl -KOKKOS_INLINE_FUNCTION -void memory_fence() { - desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); -} +// clang-format off -KOKKOS_INLINE_FUNCTION -void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +// fences +KOKKOS_INLINE_FUNCTION void memory_fence() { 
desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void load_fence() { desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void store_fence() { desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } -KOKKOS_INLINE_FUNCTION -void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } +// load/store +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_load (T const* ptr) { return desul::atomic_load (const_cast<std::remove_volatile_t<T>*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_store(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_store(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template<class T> KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_store() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_assign(T* ptr, Impl::not_deduced_atomic_t<T> val) { atomic_store(ptr, val); } +#endif // atomic_fetch_op -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> 
KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(T* const dest, 
desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_add(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_add(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_sub(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_sub(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_max(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_max(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_min(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_min(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_mul(T* ptr, Impl::not_deduced_atomic_t<T> val) { return 
desul::atomic_fetch_mul(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_div(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_div(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_mod(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_mod(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_and(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_and(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_or (T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_or (const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_xor(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_xor(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_nand(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_nand(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_lshift(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_lshift(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } 
+template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_rshift(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_fetch_rshift(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_inc(T* ptr) { return desul::atomic_fetch_inc(const_cast<std::remove_volatile_t<T>*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_fetch_dec(T* ptr) { return desul::atomic_fetch_dec(const_cast<std::remove_volatile_t<T>*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } // atomic_op_fetch -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (T* const 
dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_or_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_rshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); 
} - -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_add_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_add_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_sub_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_sub_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_max_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_max_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_min_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_min_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_mul_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_mul_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_div_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_div_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_mod_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_mod_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, 
desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_and_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_and_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_or_fetch (T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_or_fetch (const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_xor_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_xor_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_nand_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_nand_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_lshift_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_lshift_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_rshift_fetch(T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_rshift_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_inc_fetch(T* ptr) { return desul::atomic_inc_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_dec_fetch(T* ptr) { return 
desul::atomic_dec_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } // atomic_op -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_or(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) 
desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_add(T* ptr, Impl::not_deduced_atomic_t<T> val) { desul::atomic_add(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_sub(T* ptr, Impl::not_deduced_atomic_t<T> val) { desul::atomic_sub(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_max(T* ptr, Impl::not_deduced_atomic_t<T> val) { desul::atomic_max(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_min(T* ptr, Impl::not_deduced_atomic_t<T> val) { desul::atomic_min(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_mul(T* ptr, Impl::not_deduced_atomic_t<T> val) { desul::atomic_mul(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_div(T* ptr, Impl::not_deduced_atomic_t<T> val) { 
desul::atomic_div(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_mod(T* ptr, Impl::not_deduced_atomic_t<T> val) { (void)desul::atomic_fetch_mod(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_and(T* ptr, Impl::not_deduced_atomic_t<T> val) { (void)desul::atomic_fetch_and(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_or (T* ptr, Impl::not_deduced_atomic_t<T> val) { (void)desul::atomic_fetch_or (const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_xor(T* ptr, Impl::not_deduced_atomic_t<T> val) { (void)desul::atomic_fetch_xor(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_nand(T* ptr, Impl::not_deduced_atomic_t<T> val) { (void)desul::atomic_nand_fetch(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_lshift(T* ptr, Impl::not_deduced_atomic_t<T> val) { (void)desul::atomic_fetch_lshift(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_rshift(T* ptr, Impl::not_deduced_atomic_t<T> val) { (void)desul::atomic_fetch_rshift(const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION 
Impl::enable_if_atomic_t<T, void> atomic_inc(T* ptr) { desul::atomic_inc(const_cast<std::remove_volatile_t<T>*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_dec(T* ptr) { desul::atomic_dec(const_cast<std::remove_volatile_t<T>*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template<class T> KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_inc() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_increment(T* ptr) { atomic_inc(ptr); } +template<class T> KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_dec() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, void> atomic_decrement(T* ptr) { atomic_dec(ptr); } +#endif -template<class T> KOKKOS_INLINE_FUNCTION -void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +// exchange +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_exchange (T* ptr, Impl::not_deduced_atomic_t<T> val) { return desul::atomic_exchange (const_cast<std::remove_volatile_t<T>*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template<class T> KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, T> atomic_compare_exchange(T* ptr, Impl::not_deduced_atomic_t<T> expected, Impl::not_deduced_atomic_t<T> desired) { return desul::atomic_compare_exchange(const_cast<std::remove_volatile_t<T>*>(ptr), expected, desired, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template<class T> KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_compare_exchange() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t<T, bool> atomic_compare_exchange_strong(T* ptr, Impl::not_deduced_atomic_t<T> expected, Impl::not_deduced_atomic_t<T> desired) { return expected == atomic_compare_exchange(ptr, expected, desired); } +#endif -// Exchange +// clang-format on +} // namespace 
Kokkos -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +namespace Kokkos::Impl { -template<class T> KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> expected, desul::Impl::dont_deduce_this_parameter_t<const T> desired) { - T expected_ref = expected; - return desul::atomic_compare_exchange_strong(dest, expected_ref, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); +template <class T, class MemOrderSuccess, class MemOrderFailure> +KOKKOS_FUNCTION bool atomic_compare_exchange_strong(T* const dest, T& expected, + const T desired, + MemOrderSuccess succ, + MemOrderFailure fail) { + return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, + fail, KOKKOS_DESUL_MEM_SCOPE); } -template<class T> KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> compare, desul::Impl::dont_deduce_this_parameter_t<const T> desired) { - return desul::atomic_compare_exchange(dest, compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); +template <class T, class MemoryOrder> +KOKKOS_FUNCTION T atomic_load(const T* const src, MemoryOrder order) { + return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); } -namespace Impl { - template<class T, class MemOrderSuccess, class MemOrderFailure> KOKKOS_INLINE_FUNCTION - bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) { - return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE); - } - template<class T, class MemoryOrder> - KOKKOS_INLINE_FUNCTION - T atomic_load(const T* const src, MemoryOrder order) { - return 
desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); - } - template<class T, class MemoryOrder> - KOKKOS_INLINE_FUNCTION - void atomic_store(T* const src, const T val, MemoryOrder order) { - return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); - } -} // namespace Impl +template <class T, class MemoryOrder> +KOKKOS_FUNCTION void atomic_store(T* const src, const T val, + MemoryOrder order) { + return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); +} -} // namespace Kokkos +} // namespace Kokkos::Impl #undef KOKKOS_DESUL_MEM_SCOPE -// clang-format on #endif diff --git a/packages/kokkos/core/src/Kokkos_Clamp.hpp b/packages/kokkos/core/src/Kokkos_Clamp.hpp new file mode 100644 index 0000000000000000000000000000000000000000..033cde9ab848b3fe01543db687d5155d7ef97fee --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Clamp.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_CLAMP_HPP +#define KOKKOS_CLAMP_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { + +template <class T> +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi) { + KOKKOS_EXPECTS(!(hi < lo)); + return (value < lo) ? lo : (hi < value) ? hi : value; +} + +template <class T, class ComparatorType> +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi, + ComparatorType comp) { + KOKKOS_EXPECTS(!comp(hi, lo)); + return comp(value, lo) ? lo : comp(hi, value) ? 
hi : value; +} + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp index 4d405116ccff5772513209cd300ef57dfb439b9c..8233c30b243ce0cda1a91012f3ec91e2dba08736 100644 --- a/packages/kokkos/core/src/Kokkos_Complex.hpp +++ b/packages/kokkos/core/src/Kokkos_Complex.hpp @@ -28,6 +28,7 @@ #include <complex> #include <type_traits> #include <iosfwd> +#include <tuple> namespace Kokkos { @@ -69,9 +70,8 @@ class complex& operator=(const complex&) noexcept = default; /// \brief Conversion constructor from compatible RType - template < - class RType, - std::enable_if_t<std::is_convertible<RType, RealType>::value, int> = 0> + template <class RType, + std::enable_if_t<std::is_convertible_v<RType, RealType>, int> = 0> KOKKOS_INLINE_FUNCTION complex(const complex<RType>& other) noexcept // Intentionally do the conversions implicitly here so that users don't // get any warnings about narrowing, etc., that they would expect to get @@ -256,11 +256,16 @@ class return *this; } + template <size_t I, typename RT> + friend constexpr const RT& get(const complex<RT>&) noexcept; + + template <size_t I, typename RT> + friend constexpr const RT&& get(const complex<RT>&&) noexcept; + #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 //! Copy constructor from volatile. 
- template < - class RType, - std::enable_if_t<std::is_convertible<RType, RealType>::value, int> = 0> + template <class RType, + std::enable_if_t<std::is_convertible_v<RType, RealType>, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex(const volatile complex<RType>& src) noexcept // Intentionally do the conversions implicitly here so that users don't @@ -289,7 +294,7 @@ class // vl = r; // vl = cr; template <class Complex, - std::enable_if_t<std::is_same<Complex, complex>::value, int> = 0> + std::enable_if_t<std::is_same_v<Complex, complex>, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator=( const Complex& src) volatile noexcept { re_ = src.re_; @@ -312,7 +317,7 @@ class // vl = vr; // vl = cvr; template <class Complex, - std::enable_if_t<std::is_same<Complex, complex>::value, int> = 0> + std::enable_if_t<std::is_same_v<Complex, complex>, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile complex& operator=( const volatile Complex& src) volatile noexcept { re_ = src.re_; @@ -334,7 +339,7 @@ class // l = cvr; // template <class Complex, - std::enable_if_t<std::is_same<Complex, complex>::value, int> = 0> + std::enable_if_t<std::is_same_v<Complex, complex>, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( const volatile Complex& src) noexcept { re_ = src.re_; @@ -423,6 +428,75 @@ class #endif // KOKKOS_ENABLE_DEPRECATED_CODE_4 }; +} // namespace Kokkos + +// Tuple protocol for complex based on https://wg21.link/P2819R2 (voted into +// the C++26 working draft on 2023-11) + +template <typename RealType> +struct std::tuple_size<Kokkos::complex<RealType>> + : std::integral_constant<size_t, 2> {}; + +template <size_t I, typename RealType> +struct std::tuple_element<I, Kokkos::complex<RealType>> { + static_assert(I < 2); + using type = RealType; +}; + +namespace Kokkos { + +// get<...>(...) 
defined here so as not to be hidden friends, as per P2819R2 + +template <size_t I, typename RealType> +KOKKOS_FUNCTION constexpr RealType& get(complex<RealType>& z) noexcept { + static_assert(I < 2); + if constexpr (I == 0) + return z.real(); + else + return z.imag(); +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +template <size_t I, typename RealType> +KOKKOS_FUNCTION constexpr RealType&& get(complex<RealType>&& z) noexcept { + static_assert(I < 2); + if constexpr (I == 0) + return std::move(z.real()); + else + return std::move(z.imag()); +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +template <size_t I, typename RealType> +KOKKOS_FUNCTION constexpr const RealType& get( + const complex<RealType>& z) noexcept { + static_assert(I < 2); + if constexpr (I == 0) + return z.re_; + else + return z.im_; +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +template <size_t I, typename RealType> +KOKKOS_FUNCTION constexpr const RealType&& get( + const complex<RealType>&& z) noexcept { + static_assert(I < 2); + if constexpr (I == 0) + return std::move(z.re_); + else + return std::move(z.im_); +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + //============================================================================== // <editor-fold desc="Equality and inequality"> {{{1 @@ -463,7 +537,7 @@ inline bool operator==(complex<RealType1> const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t<std::is_convertible<RealType2, RealType1>::value, int> = 0> + std::enable_if_t<std::is_convertible_v<RealType2, RealType1>, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(complex<RealType1> const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t<RealType1, RealType2>; @@ -475,7 +549,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(complex<RealType1> const& x, template < class RealType1, class 
RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t<std::is_convertible<RealType1, RealType2>::value, int> = 0> + std::enable_if_t<std::is_convertible_v<RealType1, RealType2>, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x, complex<RealType2> const& y) noexcept { using common_type = std::common_type_t<RealType1, RealType2>; @@ -514,7 +588,7 @@ inline bool operator!=(complex<RealType1> const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t<std::is_convertible<RealType2, RealType1>::value, int> = 0> + std::enable_if_t<std::is_convertible_v<RealType2, RealType1>, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(complex<RealType1> const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t<RealType1, RealType2>; @@ -526,7 +600,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(complex<RealType1> const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t<std::is_convertible<RealType1, RealType2>::value, int> = 0> + std::enable_if_t<std::is_convertible_v<RealType1, RealType2>, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x, complex<RealType2> const& y) noexcept { using common_type = std::common_type_t<RealType1, RealType2>; @@ -702,16 +776,14 @@ KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x, return x == T() ? 
T() : exp(y * log(x)); } -template <class T, class U, - class = std::enable_if_t<std::is_arithmetic<T>::value>> +template <class T, class U, class = std::enable_if_t<std::is_arithmetic_v<T>>> KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow( const T& x, const complex<U>& y) { using type = Impl::promote_2_t<T, U>; return pow(type(x), complex<type>(y)); } -template <class T, class U, - class = std::enable_if_t<std::is_arithmetic<U>::value>> +template <class T, class U, class = std::enable_if_t<std::is_arithmetic_v<U>>> KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(const complex<T>& x, const U& y) { using type = Impl::promote_2_t<T, U>; diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp index df78a644a034deb6689b640f35035d36f9378349..0bfb9eb5fa404f6e3b6f2959b2f3bddeb567c91e 100644 --- a/packages/kokkos/core/src/Kokkos_Concepts.hpp +++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp @@ -41,8 +41,7 @@ struct Dynamic {}; // Schedule Wrapper Type template <class T> struct Schedule { - static_assert(std::is_same<T, Static>::value || - std::is_same<T, Dynamic>::value, + static_assert(std::is_same_v<T, Static> || std::is_same_v<T, Dynamic>, "Kokkos: Invalid Schedule<> type."); using schedule_type = Schedule; using type = T; @@ -51,7 +50,7 @@ struct Schedule { // Specify Iteration Index Type template <typename T> struct IndexType { - static_assert(std::is_integral<T>::value, "Kokkos: Invalid IndexType<>."); + static_assert(std::is_integral_v<T>, "Kokkos: Invalid IndexType<>."); using index_type = IndexType; using type = T; }; @@ -139,8 +138,8 @@ namespace Kokkos { \ public: \ static constexpr bool value = \ - std::is_base_of<detected_t<have_t, T>, T>::value || \ - std::is_base_of<detected_t<have_type_t, T>, T>::value; \ + std::is_base_of_v<detected_t<have_t, T>, T> || \ + std::is_base_of_v<detected_t<have_type_t, T>, T>; \ constexpr operator bool() const noexcept { return value; } \ }; \ template 
<typename T> \ @@ -292,44 +291,6 @@ struct is_space { using execution_space = typename is_exe::space; using memory_space = typename is_mem::space; - - // For backward compatibility, deprecated in favor of - // Kokkos::Impl::HostMirror<S>::host_mirror_space - - private: - // The actual definitions for host_memory_space and host_execution_spaces are - // in do_not_use_host_memory_space and do_not_use_host_execution_space to be - // able to use them within this class without deprecation warnings. - using do_not_use_host_memory_space = std::conditional_t< - std::is_same<memory_space, Kokkos::HostSpace>::value -#if defined(KOKKOS_ENABLE_CUDA) - || std::is_same<memory_space, Kokkos::CudaUVMSpace>::value || - std::is_same<memory_space, Kokkos::CudaHostPinnedSpace>::value -#elif defined(KOKKOS_ENABLE_HIP) - || std::is_same<memory_space, Kokkos::HIPHostPinnedSpace>::value || - std::is_same<memory_space, Kokkos::HIPManagedSpace>::value -#elif defined(KOKKOS_ENABLE_SYCL) - || std::is_same<memory_space, - Kokkos::Experimental::SYCLSharedUSMSpace>::value || - std::is_same<memory_space, - Kokkos::Experimental::SYCLHostUSMSpace>::value -#endif - , - memory_space, Kokkos::HostSpace>; - - using do_not_use_host_execution_space = std::conditional_t< -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same<execution_space, Kokkos::Cuda>::value || -#elif defined(KOKKOS_ENABLE_HIP) - std::is_same<execution_space, Kokkos::HIP>::value || -#elif defined(KOKKOS_ENABLE_SYCL) - std::is_same<execution_space, Kokkos::Experimental::SYCL>::value || -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - std::is_same<execution_space, - Kokkos::Experimental::OpenMPTarget>::value || -#endif - false, - Kokkos::DefaultHostExecutionSpace, execution_space>; }; } // namespace Kokkos @@ -357,7 +318,7 @@ struct MemorySpaceAccess { * 2. All execution spaces that can access DstMemorySpace can also access * SrcMemorySpace. 
*/ - enum { assignable = std::is_same<DstMemorySpace, SrcMemorySpace>::value }; + enum { assignable = std::is_same_v<DstMemorySpace, SrcMemorySpace> }; /**\brief For all DstExecSpace::memory_space == DstMemorySpace * DstExecSpace can access SrcMemorySpace. @@ -442,7 +403,7 @@ struct SpaceAccessibility { // If same memory space or not accessible use the AccessSpace // else construct a device with execution space and memory space. using space = std::conditional_t< - std::is_same<typename AccessSpace::memory_space, MemorySpace>::value || + std::is_same_v<typename AccessSpace::memory_space, MemorySpace> || !exe_access::accessible, AccessSpace, Kokkos::Device<typename AccessSpace::execution_space, MemorySpace>>; diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp index a0ca55be7043e8157d5cb44580e9be5f97149f16..7da59aa4e419c6676e22f109f763cb17d800f407 100644 --- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp @@ -22,6 +22,7 @@ static_assert(false, #ifndef KOKKOS_COPYVIEWS_HPP_ #define KOKKOS_COPYVIEWS_HPP_ #include <string> +#include <sstream> #include <Kokkos_Parallel.hpp> #include <KokkosExp_MDRangePolicy.hpp> #include <Kokkos_Layout.hpp> @@ -220,10 +221,12 @@ struct ViewFill<ViewType, Layout, ExecSpace, 7, iType> { ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) : a(a_), val(val_) { + // MDRangePolicy is not supported for 7D views + // Iterate separately over extent(2) Kokkos::parallel_for("Kokkos::ViewFill-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, - {a.extent(0), a.extent(1), a.extent(2), - a.extent(3), a.extent(5), a.extent(6)}), + {a.extent(0), a.extent(1), a.extent(3), + a.extent(4), a.extent(5), a.extent(6)}), *this); } @@ -248,6 +251,8 @@ struct ViewFill<ViewType, Layout, ExecSpace, 8, iType> { ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) : a(a_), val(val_) 
{ + // MDRangePolicy is not supported for 8D views + // Iterate separately over extent(2) and extent(4) Kokkos::parallel_for("Kokkos::ViewFill-8D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -292,9 +297,11 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 2, iType> { ViewTypeA a; ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<2, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -322,9 +329,11 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 3, iType> { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<3, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -353,9 +362,11 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 4, iType> { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + 
Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<4, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -385,9 +396,11 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 5, iType> { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<5, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -417,9 +430,11 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 6, iType> { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<6, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -449,9 +464,11 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 7, iType> { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<6, outer_iteration_pattern, 
inner_iteration_pattern>; using policy_type = @@ -460,6 +477,8 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 7, iType> { ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) : a(a_), b(b_) { + // MDRangePolicy is not supported for 7D views + // Iterate separately over extent(2) Kokkos::parallel_for("Kokkos::ViewCopy-7D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -482,9 +501,11 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 8, iType> { ViewTypeB b; static const Kokkos::Iterate outer_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::outer_iteration_pattern; static const Kokkos::Iterate inner_iteration_pattern = - Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + Kokkos::Impl::layout_iterate_type_selector< + Layout>::inner_iteration_pattern; using iterate_type = Kokkos::Rank<6, outer_iteration_pattern, inner_iteration_pattern>; using policy_type = @@ -493,6 +514,8 @@ struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 8, iType> { ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, const ExecSpace space = ExecSpace()) : a(a_), b(b_) { + // MDRangePolicy is not supported for 8D views + // Iterate separately over extent(2) and extent(4) Kokkos::parallel_for("Kokkos::ViewCopy-8D", policy_type(space, {0, 0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(3), @@ -538,24 +561,20 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (Kokkos::is_layouttiled<typename DstType::array_layout>::value) { - iterate = Kokkos::layout_iterate_type_selector< - typename DstType::array_layout>::outer_iteration_pattern; - } else if (std::is_same<typename DstType::array_layout, - Kokkos::LayoutRight>::value) { + if (std::is_same_v<typename DstType::array_layout, 
Kokkos::LayoutRight>) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same<typename DstType::array_layout, - Kokkos::LayoutLeft>::value) { + } else if (std::is_same_v<typename DstType::array_layout, + Kokkos::LayoutLeft>) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same<typename DstType::array_layout, - Kokkos::LayoutStride>::value) { + } else if (std::is_same_v<typename DstType::array_layout, + Kokkos::LayoutStride>) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same<typename DstType::execution_space::array_layout, - Kokkos::LayoutRight>::value) + if (std::is_same_v<typename DstType::execution_space::array_layout, + Kokkos::LayoutRight>) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -612,36 +631,37 @@ void view_copy(const DstType& dst, const SrcType& src) { }; if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) { - std::string message( - "Error: Kokkos::deep_copy with no available copy mechanism: "); - message += src.label(); - message += " to "; - message += dst.label(); - Kokkos::Impl::throw_runtime_exception(message); + std::ostringstream ss; + ss << "Error: Kokkos::deep_copy with no available copy mechanism: " + << "from source view (\"" << src.label() << "\") to destination view (\"" + << dst.label() << "\").\n" + << "There is no common execution space that can access both source's " + "space\n" + << "(" << src_memory_space().name() << ") and destination's space (" + << dst_memory_space().name() << "), " + << "so source and destination\n" + << "must be contiguous and have the same layout.\n"; + Kokkos::Impl::throw_runtime_exception(ss.str()); } // Figure out iteration order in case we need it int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (Kokkos::is_layouttiled<typename DstType::array_layout>::value) { - iterate = Kokkos::layout_iterate_type_selector< - typename 
DstType::array_layout>::outer_iteration_pattern; - } else if (std::is_same<typename DstType::array_layout, - Kokkos::LayoutRight>::value) { + if (std::is_same_v<typename DstType::array_layout, Kokkos::LayoutRight>) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same<typename DstType::array_layout, - Kokkos::LayoutLeft>::value) { + } else if (std::is_same_v<typename DstType::array_layout, + Kokkos::LayoutLeft>) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same<typename DstType::array_layout, - Kokkos::LayoutStride>::value) { + } else if (std::is_same_v<typename DstType::array_layout, + Kokkos::LayoutStride>) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same<typename DstType::execution_space::array_layout, - Kokkos::LayoutRight>::value) + if (std::is_same_v<typename DstType::execution_space::array_layout, + Kokkos::LayoutRight>) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1328,41 +1348,43 @@ inline void contiguous_fill( } // Default implementation for execution spaces that don't provide a definition -template <typename ExecutionSpace, class ViewType> +template <typename ExecutionSpace> struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst, - typename ViewType::const_value_type& value) { - contiguous_fill(exec_space, dst, value); - } - - ZeroMemset(const ViewType& dst, typename ViewType::const_value_type& value) { - contiguous_fill(ExecutionSpace(), dst, value); + ZeroMemset(const ExecutionSpace& exec_space, void* dst, size_t cnt) { + contiguous_fill( + exec_space, + Kokkos::View<std::byte*, ExecutionSpace, Kokkos::MemoryUnmanaged>( + static_cast<std::byte*>(dst), cnt), + std::byte{}); } }; template <typename ExecutionSpace, class DT, class... 
DP> inline std::enable_if_t< - std::is_trivial<typename ViewTraits<DT, DP...>::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits<DT, DP...>::value_type>::value> + std::is_trivial_v<typename ViewTraits<DT, DP...>::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value) { -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) - ZeroMemset<ExecutionSpace, View<DT, DP...>>(exec_space, dst, value); - else + // With OpenMP, using memset has significant performance issues. + if (Impl::is_zero_byte(value) +#ifdef KOKKOS_ENABLE_OPENMP + && !std::is_same_v<ExecutionSpace, Kokkos::OpenMP> #endif + ) + // FIXME intel/19 icpc fails to deduce template parameter here, + // resulting in compilation errors; explicitly passing the template + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. + ZeroMemset<ExecutionSpace>( + exec_space, dst.data(), + dst.size() * sizeof(typename ViewTraits<DT, DP...>::value_type)); + else contiguous_fill(exec_space, dst, value); } template <typename ExecutionSpace, class DT, class... DP> inline std::enable_if_t< - !(std::is_trivial<typename ViewTraits<DT, DP...>::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits<DT, DP...>::value_type>::value)> + !std::is_trivial_v<typename ViewTraits<DT, DP...>::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value) { @@ -1371,30 +1393,32 @@ contiguous_fill_or_memset( template <class DT, class... 
DP> inline std::enable_if_t< - std::is_trivial<typename ViewTraits<DT, DP...>::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits<DT, DP...>::value_type>::value> + std::is_trivial_v<typename ViewTraits<DT, DP...>::value_type>> contiguous_fill_or_memset( const View<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value) { using ViewType = View<DT, DP...>; using exec_space_type = typename ViewType::execution_space; + exec_space_type exec; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset<exec_space_type, View<DT, DP...>>(dst, value); + // FIXME intel/19 icpc fails to deduce template parameter here, + // resulting in compilation errors; explicitly passing the template + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. + ZeroMemset<exec_space_type>( + exec, dst.data(), dst.size() * sizeof(typename ViewType::value_type)); else #endif - contiguous_fill(exec_space_type(), dst, value); + contiguous_fill(exec, dst, value); } template <class DT, class... DP> inline std::enable_if_t< - !(std::is_trivial<typename ViewTraits<DT, DP...>::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits<DT, DP...>::value_type>::value)> + !std::is_trivial_v<typename ViewTraits<DT, DP...>::value_type>> contiguous_fill_or_memset( const View<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value) { @@ -1410,8 +1434,8 @@ template <class DT, class... 
DP> inline void deep_copy( const View<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { using ViewType = View<DT, DP...>; using exec_space_type = typename ViewType::execution_space; @@ -1433,8 +1457,8 @@ inline void deep_copy( } Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence"); - static_assert(std::is_same<typename ViewType::non_const_value_type, - typename ViewType::value_type>::value, + static_assert(std::is_same_v<typename ViewType::non_const_value_type, + typename ViewType::value_type>, "deep_copy requires non-const type"); // If contiguous we can simply do a 1D flat loop or use memset @@ -1451,21 +1475,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same<typename ViewType::array_layout, - Kokkos::LayoutRight>::value) { + if (std::is_same_v<typename ViewType::array_layout, Kokkos::LayoutRight>) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same<typename ViewType::array_layout, - Kokkos::LayoutLeft>::value) { + } else if (std::is_same_v<typename ViewType::array_layout, + Kokkos::LayoutLeft>) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same<typename ViewType::array_layout, - Kokkos::LayoutStride>::value) { + } else if (std::is_same_v<typename ViewType::array_layout, + Kokkos::LayoutStride>) { if (strides[0] > strides[ViewType::rank > 0 ? 
ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same<typename ViewType::execution_space::array_layout, - Kokkos::LayoutRight>::value) + if (std::is_same_v<typename ViewType::execution_space::array_layout, + Kokkos::LayoutRight>) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1508,8 +1531,8 @@ template <class ST, class... SP> inline void deep_copy( typename ViewTraits<ST, SP...>::non_const_value_type& dst, const View<ST, SP...>& src, - std::enable_if_t<std::is_same<typename ViewTraits<ST, SP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<ST, SP...>::specialize, + void>>* = nullptr) { using src_traits = ViewTraits<ST, SP...>; using src_memory_space = typename src_traits::memory_space; @@ -1545,8 +1568,8 @@ template <class DT, class... DP, class ST, class... SP> inline void deep_copy( const View<DT, DP...>& dst, const View<ST, SP...>& src, std::enable_if_t< - (std::is_void<typename ViewTraits<DT, DP...>::specialize>::value && - std::is_void<typename ViewTraits<ST, SP...>::specialize>::value && + (std::is_void_v<typename ViewTraits<DT, DP...>::specialize> && + std::is_void_v<typename ViewTraits<ST, SP...>::specialize> && (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) && unsigned(ViewTraits<ST, SP...>::rank) == unsigned(0)))>* = nullptr) { using dst_type = View<DT, DP...>; @@ -1556,8 +1579,8 @@ inline void deep_copy( using dst_memory_space = typename dst_type::memory_space; using src_memory_space = typename src_type::memory_space; - static_assert(std::is_same<typename dst_type::value_type, - typename src_type::non_const_value_type>::value, + static_assert(std::is_same_v<typename dst_type::value_type, + typename src_type::non_const_value_type>, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -1597,8 +1620,8 @@ template 
<class DT, class... DP, class ST, class... SP> inline void deep_copy( const View<DT, DP...>& dst, const View<ST, SP...>& src, std::enable_if_t< - (std::is_void<typename ViewTraits<DT, DP...>::specialize>::value && - std::is_void<typename ViewTraits<ST, SP...>::specialize>::value && + (std::is_void_v<typename ViewTraits<DT, DP...>::specialize> && + std::is_void_v<typename ViewTraits<ST, SP...>::specialize> && (unsigned(ViewTraits<DT, DP...>::rank) != 0 || unsigned(ViewTraits<ST, SP...>::rank) != 0))>* = nullptr) { using dst_type = View<DT, DP...>; @@ -1610,8 +1633,8 @@ inline void deep_copy( using dst_value_type = typename dst_type::value_type; using src_value_type = typename src_type::value_type; - static_assert(std::is_same<typename dst_type::value_type, - typename dst_type::non_const_value_type>::value, + static_assert(std::is_same_v<typename dst_type::value_type, + typename dst_type::non_const_value_type>, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -1741,10 +1764,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same<typename dst_type::value_type, - typename src_type::non_const_value_type>::value && - (std::is_same<typename dst_type::array_layout, - typename src_type::array_layout>::value || + if (std::is_same_v<typename dst_type::value_type, + typename src_type::non_const_value_type> && + (std::is_same_v<typename dst_type::array_layout, + typename src_type::array_layout> || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2160,8 +2183,8 @@ template <class TeamType, class DT, class... 
DP> void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const TeamType& team, const View<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { Kokkos::parallel_for(Kokkos::TeamVectorRange(team, dst.span()), [&](const int& i) { dst.data()[i] = value; }); } @@ -2170,8 +2193,8 @@ template <class DT, class... DP> void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const View<DT, DP...>& dst, typename ViewTraits<DT, DP...>::const_value_type& value, - std::enable_if_t<std::is_same<typename ViewTraits<DT, DP...>::specialize, - void>::value>* = nullptr) { + std::enable_if_t<std::is_same_v<typename ViewTraits<DT, DP...>::specialize, + void>>* = nullptr) { for (size_t i = 0; i < dst.span(); ++i) { dst.data()[i] = value; } @@ -2537,13 +2560,13 @@ inline void deep_copy( typename ViewTraits<DT, DP...>::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space<ExecSpace>::value && - std::is_void<typename ViewTraits<DT, DP...>::specialize>::value && + std::is_void_v<typename ViewTraits<DT, DP...>::specialize> && Kokkos::SpaceAccessibility<ExecSpace, typename ViewTraits<DT, DP...>:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits<DT, DP...>; - static_assert(std::is_same<typename dst_traits::non_const_value_type, - typename dst_traits::value_type>::value, + static_assert(std::is_same_v<typename dst_traits::non_const_value_type, + typename dst_traits::value_type>, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2563,21 +2586,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same<typename ViewType::array_layout, - 
Kokkos::LayoutRight>::value) { + if (std::is_same_v<typename ViewType::array_layout, Kokkos::LayoutRight>) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same<typename ViewType::array_layout, - Kokkos::LayoutLeft>::value) { + } else if (std::is_same_v<typename ViewType::array_layout, + Kokkos::LayoutLeft>) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same<typename ViewType::array_layout, - Kokkos::LayoutStride>::value) { + } else if (std::is_same_v<typename ViewType::array_layout, + Kokkos::LayoutStride>) { if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same<typename ViewType::execution_space::array_layout, - Kokkos::LayoutRight>::value) + if (std::is_same_v<typename ViewType::execution_space::array_layout, + Kokkos::LayoutRight>) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -2618,13 +2640,13 @@ inline void deep_copy( typename ViewTraits<DT, DP...>::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space<ExecSpace>::value && - std::is_void<typename ViewTraits<DT, DP...>::specialize>::value && + std::is_void_v<typename ViewTraits<DT, DP...>::specialize> && !Kokkos::SpaceAccessibility<ExecSpace, typename ViewTraits<DT, DP...>:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits<DT, DP...>; - static_assert(std::is_same<typename dst_traits::non_const_value_type, - typename dst_traits::value_type>::value, + static_assert(std::is_same_v<typename dst_traits::non_const_value_type, + typename dst_traits::value_type>, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2665,8 +2687,8 @@ inline void deep_copy( typename ViewTraits<ST, SP...>::non_const_value_type& dst, const View<ST, SP...>& src, 
std::enable_if_t<Kokkos::is_execution_space<ExecSpace>::value && - std::is_same<typename ViewTraits<ST, SP...>::specialize, - void>::value>* = nullptr) { + std::is_same_v<typename ViewTraits<ST, SP...>::specialize, + void>>* = nullptr) { using src_traits = ViewTraits<ST, SP...>; using src_memory_space = typename src_traits::memory_space; static_assert(src_traits::rank == 0, @@ -2703,8 +2725,8 @@ inline void deep_copy( const View<ST, SP...>& src, std::enable_if_t< (Kokkos::is_execution_space<ExecSpace>::value && - std::is_void<typename ViewTraits<DT, DP...>::specialize>::value && - std::is_void<typename ViewTraits<ST, SP...>::specialize>::value && + std::is_void_v<typename ViewTraits<DT, DP...>::specialize> && + std::is_void_v<typename ViewTraits<ST, SP...>::specialize> && (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) && unsigned(ViewTraits<ST, SP...>::rank) == unsigned(0)))>* = nullptr) { using src_traits = ViewTraits<ST, SP...>; @@ -2712,8 +2734,8 @@ inline void deep_copy( using src_memory_space = typename src_traits::memory_space; using dst_memory_space = typename dst_traits::memory_space; - static_assert(std::is_same<typename dst_traits::value_type, - typename src_traits::non_const_value_type>::value, + static_assert(std::is_same_v<typename dst_traits::value_type, + typename src_traits::non_const_value_type>, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2753,15 +2775,15 @@ inline void deep_copy( const View<ST, SP...>& src, std::enable_if_t< (Kokkos::is_execution_space<ExecSpace>::value && - std::is_void<typename ViewTraits<DT, DP...>::specialize>::value && - std::is_void<typename ViewTraits<ST, SP...>::specialize>::value && + std::is_void_v<typename ViewTraits<DT, DP...>::specialize> && + std::is_void_v<typename ViewTraits<ST, SP...>::specialize> && (unsigned(ViewTraits<DT, DP...>::rank) != 0 || unsigned(ViewTraits<ST, SP...>::rank) != 0))>* = nullptr) { 
using dst_type = View<DT, DP...>; using src_type = View<ST, SP...>; - static_assert(std::is_same<typename dst_type::value_type, - typename dst_type::non_const_value_type>::value, + static_assert(std::is_same_v<typename dst_type::value_type, + typename dst_type::non_const_value_type>, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -2891,10 +2913,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same<typename dst_type::value_type, - typename src_type::non_const_value_type>::value && - (std::is_same<typename dst_type::array_layout, - typename src_type::array_layout>::value || + if (std::is_same_v<typename dst_type::value_type, + typename src_type::non_const_value_type> && + (std::is_same_v<typename dst_type::array_layout, + typename src_type::array_layout> || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2963,11 +2985,11 @@ bool size_mismatch(const ViewType& view, unsigned int max_extent, /** \brief Resize a view with copying old data to new data at the corresponding * indices. */ template <class T, class... P, class... 
ViewCtorArgs> -inline typename std::enable_if< - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value>::type +inline std::enable_if_t< + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight>> impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, Kokkos::View<T, P...>& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, @@ -3017,10 +3039,10 @@ impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight>> resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3035,10 +3057,10 @@ resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, template <class T, class... 
P> inline std::enable_if_t< - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight>> resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3054,10 +3076,10 @@ template <class I, class T, class... P> inline std::enable_if_t< (Impl::is_view_ctor_property<I>::value || Kokkos::is_execution_space<I>::value) && - (std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight>)> resize(const I& arg_prop, Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3072,13 +3094,12 @@ resize(const I& arg_prop, Kokkos::View<T, P...>& v, template <class T, class... P, class... 
ViewCtorArgs> inline std::enable_if_t< - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value> + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutStride>> impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, Kokkos::View<T, P...>& v, const typename Kokkos::View<T, P...>::array_layout& layout) { @@ -3119,13 +3140,12 @@ impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, // the same as the existing one. template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - !(std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value)> + !(std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutStride>)> impl_resize(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, Kokkos::View<T, P...>& v, const typename Kokkos::View<T, P...>::array_layout& layout) { @@ -3189,10 +3209,10 @@ inline void resize(Kokkos::View<T, P...>& v, /** \brief Resize a view with discarding old data. */ template <class T, class... P, class... 
ViewCtorArgs> inline std::enable_if_t< - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight>> impl_realloc(Kokkos::View<T, P...>& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, const size_t n6, const size_t n7, @@ -3220,7 +3240,10 @@ impl_realloc(Kokkos::View<T, P...>& v, const size_t n0, const size_t n1, v = view_type(); // Best effort to deallocate in case no other view refers // to the shared allocation v = view_type(arg_prop_copy, n0, n1, n2, n3, n4, n5, n6, n7); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop); @@ -3232,10 +3255,10 @@ impl_realloc(Kokkos::View<T, P...>& v, const size_t n0, const size_t n1, template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight>> realloc(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3251,10 +3274,10 @@ realloc(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, template <class T, class... 
P> inline std::enable_if_t< - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight>> realloc(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3270,10 +3293,10 @@ realloc(Kokkos::View<T, P...>& v, template <class I, class T, class... P> inline std::enable_if_t< Impl::is_view_ctor_property<I>::value && - (std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight>)> realloc(const I& arg_prop, Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3288,13 +3311,12 @@ realloc(const I& arg_prop, Kokkos::View<T, P...>& v, template <class T, class... P, class... 
ViewCtorArgs> inline std::enable_if_t< - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value> + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutStride>> impl_realloc(Kokkos::View<T, P...>& v, const typename Kokkos::View<T, P...>::array_layout& layout, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { @@ -3316,7 +3338,10 @@ impl_realloc(Kokkos::View<T, P...>& v, if (v.layout() != layout) { v = view_type(); // Deallocate first, if the only view to allocation v = view_type(arg_prop, layout); - } else if (alloc_prop_input::initialize) { + return; + } + + if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { const auto& exec_space = Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop); @@ -3331,13 +3356,12 @@ impl_realloc(Kokkos::View<T, P...>& v, // the same as the existing one. template <class T, class... P, class... 
ViewCtorArgs> inline std::enable_if_t< - !(std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename Kokkos::View<T, P...>::array_layout, - Kokkos::LayoutStride>::value || - is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value)> + !(std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename Kokkos::View<T, P...>::array_layout, + Kokkos::LayoutStride>)> impl_realloc(Kokkos::View<T, P...>& v, const typename Kokkos::View<T, P...>::array_layout& layout, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { @@ -3402,7 +3426,7 @@ struct MirrorViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same<memory_space, typename src_view_type::memory_space>::value + std::is_same_v<memory_space, typename src_view_type::memory_space> }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -3417,26 +3441,7 @@ struct MirrorViewType { std::conditional_t<is_same_memspace, src_view_type, dest_view_type>; }; -template <class Space, class T, class... P> -struct MirrorType { - // The incoming view_type - using src_view_type = typename Kokkos::View<T, P...>; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same<memory_space, typename src_view_type::memory_space>::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. 
- using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::View<data_type, array_layout, Space>; -}; - +// collection of static asserts for create_mirror and create_mirror_view template <class... ViewCtorArgs> void check_view_ctor_args_create_mirror() { using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; @@ -3455,232 +3460,231 @@ void check_view_ctor_args_create_mirror() { "not explicitly allow padding!"); } +// create a mirror +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... ViewCtorArgs> -inline std::enable_if_t<!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space, - typename Kokkos::View<T, P...>::HostMirror> -create_mirror(const Kokkos::View<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - using src_type = View<T, P...>; - using dst_type = typename src_type::HostMirror; - +inline auto create_mirror(const Kokkos::View<T, P...>& src, + const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { check_view_ctor_args_create_mirror<ViewCtorArgs...>(); auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return dst_type(prop_copy, src.layout()); -} - -// Create a mirror in a new space (specialization for different space) -template <class T, class... P, class... 
ViewCtorArgs, - class Enable = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -auto create_mirror(const Kokkos::View<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - check_view_ctor_args_create_mirror<ViewCtorArgs...>(); - - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string(src.label()).append("_mirror")); - using alloc_prop = decltype(prop_copy); - - return typename Impl::MirrorType<typename alloc_prop::memory_space, T, - P...>::view_type(prop_copy, src.layout()); + if constexpr (Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space) { + using memory_space = typename decltype(prop_copy)::memory_space; + using dst_type = + typename Impl::MirrorViewType<memory_space, T, P...>::dest_view_type; + return dst_type(prop_copy, src.layout()); + } else { + using dst_type = typename View<T, P...>::HostMirror; + return dst_type(prop_copy, src.layout()); + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } } // namespace Impl -template <class T, class... P> -std::enable_if_t<std::is_void<typename ViewTraits<T, P...>::specialize>::value, - typename Kokkos::View<T, P...>::HostMirror> -create_mirror(Kokkos::View<T, P...> const& v) { - return Impl::create_mirror(v, Impl::ViewCtorProp<>{}); +// public interface +template <class T, class... P, + typename = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +auto create_mirror(Kokkos::View<T, P...> const& src) { + return Impl::create_mirror(src, Impl::ViewCtorProp<>{}); } -template <class T, class... 
P> -std::enable_if_t<std::is_void<typename ViewTraits<T, P...>::specialize>::value, - typename Kokkos::View<T, P...>::HostMirror> -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, - Kokkos::View<T, P...> const& v) { - return Impl::create_mirror(v, view_alloc(wi)); +// public interface that accepts a without initializing flag +template <class T, class... P, + typename = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, + Kokkos::View<T, P...> const& src) { + return Impl::create_mirror(src, view_alloc(wi)); } +// public interface that accepts a space template <class Space, class T, class... P, - typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> -std::enable_if_t<std::is_void<typename ViewTraits<T, P...>::specialize>::value, - typename Impl::MirrorType<Space, T, P...>::view_type> -create_mirror(Space const&, Kokkos::View<T, P...> const& v) { - return Impl::create_mirror(v, view_alloc(typename Space::memory_space{})); + typename Enable = std::enable_if_t< + Kokkos::is_space<Space>::value && + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +auto create_mirror(Space const&, Kokkos::View<T, P...> const& src) { + return Impl::create_mirror(src, view_alloc(typename Space::memory_space{})); } +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc template <class T, class... P, class... ViewCtorArgs, - typename Enable = std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> + typename = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> auto create_mirror(Impl::ViewCtorProp<ViewCtorArgs...> const& arg_prop, - Kokkos::View<T, P...> const& v) { - return Impl::create_mirror(v, arg_prop); -} - -template <class T, class... P, class... 
ViewCtorArgs> -std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space, - typename Kokkos::View<T, P...>::HostMirror> -create_mirror(Impl::ViewCtorProp<ViewCtorArgs...> const& arg_prop, - Kokkos::View<T, P...> const& v) { - return Impl::create_mirror(v, arg_prop); + Kokkos::View<T, P...> const& src) { + return Impl::create_mirror(src, arg_prop); } +// public interface that accepts a space and a without initializing flag template <class Space, class T, class... P, - typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> -std::enable_if_t<std::is_void<typename ViewTraits<T, P...>::specialize>::value, - typename Impl::MirrorType<Space, T, P...>::view_type> -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, - Kokkos::View<T, P...> const& v) { - return Impl::create_mirror(v, view_alloc(typename Space::memory_space{}, wi)); + typename Enable = std::enable_if_t< + Kokkos::is_space<Space>::value && + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> +auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, Space const&, + Kokkos::View<T, P...> const& src) { + return Impl::create_mirror(src, + view_alloc(typename Space::memory_space{}, wi)); } namespace Impl { -template <class T, class... P, class... 
ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && - (std::is_same< - typename Kokkos::View<T, P...>::memory_space, - typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View<T, P...>::data_type, - typename Kokkos::View<T, P...>::HostMirror::data_type>::value), - typename Kokkos::View<T, P...>::HostMirror> -create_mirror_view(const Kokkos::View<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>&) { - check_view_ctor_args_create_mirror<ViewCtorArgs...>(); - return src; -} +// choose a `Kokkos::create_mirror` adapted for the provided view and the +// provided arguments +template <class View, class... ViewCtorArgs> +inline auto choose_create_mirror( + const View& src, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { + // Due to the fact that users can overload `Kokkos::create_mirror`, but also + // that they may not have implemented all of its different possible + // variations, this function chooses the correct private or public version of + // it to call. + // This helper should be used by any overload of + // `Kokkos::Impl::create_mirror_view`. 
+ + if constexpr (std::is_void_v<typename View::traits::specialize>) { + // if the view is not specialized, just call the Impl function + + // using ADL to find the later defined overload of the function + using namespace Kokkos::Impl; + + return create_mirror(src, arg_prop); + } else { + // otherwise, recreate the public call + using ViewProp = Impl::ViewCtorProp<ViewCtorArgs...>; + + // using ADL to find the later defined overload of the function + using namespace Kokkos; + + if constexpr (sizeof...(ViewCtorArgs) == 0) { + // if there are no view constructor args, call the specific public + // function + return create_mirror(src); + } else if constexpr (sizeof...(ViewCtorArgs) == 1 && + ViewProp::has_memory_space) { + // if there is one view constructor arg and it has a memory space, call + // the specific public function + return create_mirror(typename ViewProp::memory_space{}, src); + } else if constexpr (sizeof...(ViewCtorArgs) == 1 && + !ViewProp::initialize) { + // if there is one view constructor arg and it has a without initializing + // mark, call the specific public function + return create_mirror(typename Kokkos::Impl::WithoutInitializing_t{}, src); + } else if constexpr (sizeof...(ViewCtorArgs) == 2 && + ViewProp::has_memory_space && !ViewProp::initialize) { + // if there is two view constructor args and they have a memory space and + // a without initializing mark, call the specific public function + return create_mirror(typename Kokkos::Impl::WithoutInitializing_t{}, + typename ViewProp::memory_space{}, src); + } else { + // if there are other constructor args, call the generic public function -template <class T, class... P, class... 
ViewCtorArgs> -inline std::enable_if_t< - !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && - !(std::is_same<typename Kokkos::View<T, P...>::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View<T, P...>::data_type, - typename Kokkos::View<T, P...>::HostMirror::data_type>::value), - typename Kokkos::View<T, P...>::HostMirror> -create_mirror_view(const Kokkos::View<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); -} - -// Create a mirror view in a new space (specialization for same space) -template <class T, class... P, class... ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -std::enable_if_t<Impl::MirrorViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>&) { - check_view_ctor_args_create_mirror<ViewCtorArgs...>(); - return src; -} + // Beware, there are some libraries using Kokkos that don't implement + // this overload (hence the reason for this present function to exist). + return create_mirror(arg_prop, src); + } + } -// Create a mirror view in a new space (specialization for different space) -template <class T, class... P, class... 
ViewCtorArgs, - class = std::enable_if_t< - Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> -std::enable_if_t<!Impl::MirrorViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::is_same_memspace, - typename Impl::MirrorViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, - T, P...>::view_type> -create_mirror_view(const Kokkos::View<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return Kokkos::Impl::create_mirror(src, arg_prop); +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } -} // namespace Impl -template <class T, class... P> -std::enable_if_t< - std::is_same< - typename Kokkos::View<T, P...>::memory_space, - typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View<T, P...>::data_type, - typename Kokkos::View<T, P...>::HostMirror::data_type>::value, - typename Kokkos::View<T, P...>::HostMirror> -create_mirror_view(const Kokkos::View<T, P...>& src) { - return src; +// create a mirror view +// private interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class T, class... P, class... 
ViewCtorArgs> +inline auto create_mirror_view( + const Kokkos::View<T, P...>& src, + [[maybe_unused]] const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { + if constexpr (!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space) { + if constexpr (std::is_same_v<typename Kokkos::View<T, P...>::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename Kokkos::View<T, P...>::data_type, + typename Kokkos::View<T, P...>::HostMirror::data_type>) { + check_view_ctor_args_create_mirror<ViewCtorArgs...>(); + return typename Kokkos::View<T, P...>::HostMirror(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } else { + if constexpr (Impl::MirrorViewType<typename Impl::ViewCtorProp< + ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace) { + check_view_ctor_args_create_mirror<ViewCtorArgs...>(); + return typename Impl::MirrorViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::view_type(src); + } else { + return Kokkos::Impl::choose_create_mirror(src, arg_prop); + } + } +#if defined(KOKKOS_COMPILER_INTEL) || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } +} // namespace Impl +// public interface template <class T, class... P> -std::enable_if_t< - !(std::is_same< - typename Kokkos::View<T, P...>::memory_space, - typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View<T, P...>::data_type, - typename Kokkos::View<T, P...>::HostMirror::data_type>::value), - typename Kokkos::View<T, P...>::HostMirror> -create_mirror_view(const Kokkos::View<T, P...>& src) { - return Kokkos::create_mirror(src); +auto create_mirror_view(const Kokkos::View<T, P...>& src) { + return Impl::create_mirror_view(src, view_alloc()); } +// public interface that accepts a without initializing flag template <class T, class... 
P> -typename Kokkos::View<T, P...>::HostMirror create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, Kokkos::View<T, P...> const& v) { - return Impl::create_mirror_view(v, view_alloc(wi)); +auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, + Kokkos::View<T, P...> const& src) { + return Impl::create_mirror_view(src, view_alloc(wi)); } -// FIXME_C++17 Improve SFINAE here. +// public interface that accepts a space template <class Space, class T, class... P, class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> -typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view( - const Space&, const Kokkos::View<T, P...>& src, - std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace>* = - nullptr) { - return src; -} - -// FIXME_C++17 Improve SFINAE here. -template <class Space, class T, class... P, - class Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> -typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view( - const Space& space, const Kokkos::View<T, P...>& src, - std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace>* = - nullptr) { - return Kokkos::create_mirror(space, src); +auto create_mirror_view(const Space&, const Kokkos::View<T, P...>& src) { + return Impl::create_mirror_view(src, + view_alloc(typename Space::memory_space())); } +// public interface that accepts a space and a without initializing flag template <class Space, class T, class... 
P, typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> -typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, Space const&, - Kokkos::View<T, P...> const& v) { +auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, Space const&, + Kokkos::View<T, P...> const& src) { return Impl::create_mirror_view( - v, view_alloc(typename Space::memory_space{}, wi)); + src, view_alloc(typename Space::memory_space{}, wi)); } -template <class T, class... P, class... ViewCtorArgs> +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class T, class... P, class... ViewCtorArgs, + typename = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> auto create_mirror_view(const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - const Kokkos::View<T, P...>& v) { - return Impl::create_mirror_view(v, arg_prop); + const Kokkos::View<T, P...>& src) { + return Impl::create_mirror_view(src, arg_prop); } -template <class... ViewCtorArgs, class T, class... P> -auto create_mirror_view_and_copy( - const Impl::ViewCtorProp<ViewCtorArgs...>&, - const Kokkos::View<T, P...>& src, - std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - Impl::MirrorViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::is_same_memspace>* = nullptr) { +namespace Impl { + +// collection of static asserts for create_mirror_view_and_copy +template <class... 
ViewCtorArgs> +void check_view_ctor_args_create_mirror_view_and_copy() { using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; + static_assert( alloc_prop_input::has_memory_space, "The view constructor arguments passed to " @@ -3693,52 +3697,53 @@ auto create_mirror_view_and_copy( "The view constructor arguments passed to " "Kokkos::create_mirror_view_and_copy must " "not explicitly allow padding!"); - - // same behavior as deep_copy(src, src) - if (!alloc_prop_input::has_execution_space) - fence( - "Kokkos::create_mirror_view_and_copy: fence before returning src view"); - return src; } -template <class... ViewCtorArgs, class T, class... P> +} // namespace Impl + +// create a mirror view and deep copy it +// public interface that accepts arbitrary view constructor args passed by a +// view_alloc +template <class... ViewCtorArgs, class T, class... P, + class Enable = std::enable_if_t< + std::is_void_v<typename ViewTraits<T, P...>::specialize>>> auto create_mirror_view_and_copy( - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, - const Kokkos::View<T, P...>& src, - std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value && - !Impl::MirrorViewType< - typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, - P...>::is_same_memspace>* = nullptr) { + [[maybe_unused]] const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop, + const Kokkos::View<T, P...>& src) { using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; - static_assert( - alloc_prop_input::has_memory_space, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must include a memory space!"); - static_assert(!alloc_prop_input::has_pointer, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not include a pointer!"); - static_assert(!alloc_prop_input::allow_padding, - "The view constructor arguments passed to " - "Kokkos::create_mirror_view_and_copy must " - "not explicitly allow padding!"); 
- using Space = typename alloc_prop_input::memory_space; - using Mirror = typename Impl::MirrorViewType<Space, T, P...>::view_type; - - auto arg_prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, WithoutInitializing, - typename Space::execution_space{}); - - std::string& label = Impl::get_property<Impl::LabelTag>(arg_prop_copy); - if (label.empty()) label = src.label(); - auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; - if constexpr (alloc_prop_input::has_execution_space) { - deep_copy(Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop_copy), - mirror, src); - } else - deep_copy(mirror, src); - return mirror; + + Impl::check_view_ctor_args_create_mirror_view_and_copy<ViewCtorArgs...>(); + + if constexpr (Impl::MirrorViewType<typename alloc_prop_input::memory_space, T, + P...>::is_same_memspace) { + // same behavior as deep_copy(src, src) + if constexpr (!alloc_prop_input::has_execution_space) + fence( + "Kokkos::create_mirror_view_and_copy: fence before returning src " + "view"); + return src; + } else { + using Space = typename alloc_prop_input::memory_space; + using Mirror = typename Impl::MirrorViewType<Space, T, P...>::view_type; + + auto arg_prop_copy = Impl::with_properties_if_unset( + arg_prop, std::string{}, WithoutInitializing, + typename Space::execution_space{}); + + std::string& label = Impl::get_property<Impl::LabelTag>(arg_prop_copy); + if (label.empty()) label = src.label(); + auto mirror = typename Mirror::non_const_type{arg_prop_copy, src.layout()}; + if constexpr (alloc_prop_input::has_execution_space) { + deep_copy(Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop_copy), + mirror, src); + } else + deep_copy(mirror, src); + return mirror; + } +#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC) + __builtin_unreachable(); +#endif } // Previously when using auto here, the intel compiler 19.3 would @@ -3751,8 +3756,7 @@ 
create_mirror_view_and_copy( const Space&, const Kokkos::View<T, P...>& src, std::string const& name = "", std::enable_if_t< - std::is_void<typename ViewTraits<T, P...>::specialize>::value>* = - nullptr) { + std::is_void_v<typename ViewTraits<T, P...>::specialize>>* = nullptr) { return create_mirror_view_and_copy( Kokkos::view_alloc(typename Space::memory_space{}, name), src); } diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp index 805411a699ec28854971dee696f8cd430884f04c..9588d289a9ca19b8add9e01a6e95acfd9b36e43f 100644 --- a/packages/kokkos/core/src/Kokkos_Core.hpp +++ b/packages/kokkos/core/src/Kokkos_Core.hpp @@ -46,14 +46,15 @@ #include <Kokkos_Half.hpp> #include <Kokkos_AnonymousSpace.hpp> -#include <Kokkos_LogicalSpaces.hpp> #include <Kokkos_Pair.hpp> -#include <Kokkos_MinMaxClamp.hpp> +#include <Kokkos_Clamp.hpp> +#include <Kokkos_MinMax.hpp> #include <Kokkos_MathematicalConstants.hpp> #include <Kokkos_MathematicalFunctions.hpp> #include <Kokkos_MathematicalSpecialFunctions.hpp> #include <Kokkos_NumericTraits.hpp> #include <Kokkos_BitManipulation.hpp> +#include <Kokkos_Swap.hpp> #include <Kokkos_MemoryPool.hpp> #include <Kokkos_Array.hpp> #include <Kokkos_View.hpp> @@ -62,7 +63,9 @@ #include <Kokkos_hwloc.hpp> #include <Kokkos_Timer.hpp> #include <Kokkos_Tuners.hpp> +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include <Kokkos_TaskScheduler.hpp> +#endif #include <Kokkos_Complex.hpp> #include <Kokkos_CopyViews.hpp> #include <impl/Kokkos_TeamMDPolicy.hpp> @@ -101,6 +104,7 @@ void declare_configuration_metadata(const std::string& category, [[nodiscard]] bool is_finalized() noexcept; [[nodiscard]] int device_id() noexcept; +[[nodiscard]] int num_devices() noexcept; [[nodiscard]] int num_threads() noexcept; bool show_warnings() noexcept; @@ -246,9 +250,9 @@ class KOKKOS_ATTRIBUTE_NODISCARD ScopeGuard { } ScopeGuard& operator=(const ScopeGuard&) = delete; - ScopeGuard& operator=(ScopeGuard&&) = delete; - ScopeGuard(const 
ScopeGuard&) = delete; - ScopeGuard(ScopeGuard&&) = delete; + ScopeGuard& operator=(ScopeGuard&&) = delete; + ScopeGuard(const ScopeGuard&) = delete; + ScopeGuard(ScopeGuard&&) = delete; }; } // namespace Kokkos @@ -279,7 +283,7 @@ std::vector<ExecSpace> partition_space(ExecSpace const& space, "Kokkos Error: partition_space expects an Execution Space as " "first argument"); static_assert( - std::is_arithmetic<T>::value, + std::is_arithmetic_v<T>, "Kokkos Error: partitioning arguments must be integers or floats"); std::vector<ExecSpace> instances(weights.size()); @@ -300,9 +304,6 @@ std::vector<ExecSpace> partition_space(ExecSpace const& space, // implementation of the RAII wrapper is using Kokkos::single. #include <Kokkos_AcquireUniqueTokenImpl.hpp> -// Specializations required after core definitions -#include <KokkosCore_Config_PostInclude.hpp> - //---------------------------------------------------------------------------- // Redefinition of the macros min and max if we pushed them at entry of // Kokkos_Core.hpp diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp index 44f1c5b42f4d48a50909ba66c8e9fab02d249ae0..5dbe5714293f255e66678b4ea6e4944443e98836 100644 --- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -30,10 +30,6 @@ #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Utilities.hpp> -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -#include <Kokkos_MasterLock.hpp> -#endif - //---------------------------------------------------------------------------- // Have assumed a 64-bit build (8-byte pointers) throughout the code base. // 32-bit build allowed but unsupported. @@ -75,9 +71,6 @@ template <class ExecutionSpace, class MemorySpace> struct Device; // forward declare here so that backend initializer calls can use it. 
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments; -#endif class InitializationSettings; } // namespace Kokkos @@ -113,8 +106,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = HIP; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) -using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = - Experimental::SYCL; +using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = SYCL; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Experimental::OpenACC; @@ -129,7 +121,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial; #else #error \ - "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." + "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." 
#endif #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) @@ -169,7 +161,7 @@ using SharedSpace = CudaUVMSpace; using SharedSpace = HIPManagedSpace; #define KOKKOS_HAS_SHARED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) -using SharedSpace = Experimental::SYCLSharedUSMSpace; +using SharedSpace = SYCLSharedUSMSpace; #define KOKKOS_HAS_SHARED_SPACE // if only host compile point to HostSpace #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) @@ -191,7 +183,7 @@ using SharedHostPinnedSpace = CudaHostPinnedSpace; using SharedHostPinnedSpace = HIPHostPinnedSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) - using SharedHostPinnedSpace = Experimental::SYCLHostUSMSpace; + using SharedHostPinnedSpace = SYCLHostUSMSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) using SharedHostPinnedSpace = HostSpace; @@ -262,12 +254,6 @@ KOKKOS_FUNCTION void runtime_check_memory_access_violation( } } // namespace Impl - -namespace Experimental { -template <class, class, class, class> -class LogicalMemorySpace; -} - } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp index 92931b584952b24eb2a5904dc2fcb97bf1ac770a..69223b641289bae8a9dccb820b0c3a1f6d3dd652 100644 --- a/packages/kokkos/core/src/Kokkos_Crs.hpp +++ b/packages/kokkos/core/src/Kokkos_Crs.hpp @@ -84,12 +84,12 @@ class Crs { /* * Default Constructors, operators and destructor */ - KOKKOS_DEFAULTED_FUNCTION Crs() = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs const&) = default; - 
KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; - KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; /** \brief Assign to a view of the rhs array. * If the old view is the last view @@ -148,7 +148,7 @@ class GetCrsTransposeCounts { public: KOKKOS_INLINE_FUNCTION - void operator()(index_type i) const { atomic_increment(&out[in.entries(i)]); } + void operator()(index_type i) const { atomic_inc(&out[in.entries(i)]); } GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out) : in(arg_in), out(arg_out) { using policy_type = RangePolicy<index_type, execution_space>; @@ -345,7 +345,7 @@ struct CountAndFill : public CountAndFillBase<CrsType, Functor> { closure.execute(); } auto nentries = Kokkos::get_crs_row_map_from_counts(this->m_crs.row_map, - this->m_counts); + this->m_counts); this->m_counts = counts_type(); this->m_crs.entries = entries_type("entries", nentries); { diff --git a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp index ae28805a42ee1acaad768f2cc634c48107465c2d..8af10b2a409badda531fcd1f18a1bc8a084a46eb 100644 --- a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp +++ b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp @@ -54,8 +54,8 @@ struct detector<Default, std::void_t<Op<Args...>>, Op, Args...> { } // namespace Impl struct nonesuch : private Impl::nonesuch_base { - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; void operator=(nonesuch const&) = delete; }; @@ -81,7 +81,7 @@ inline constexpr bool is_detected_v = is_detected<Op, Args...>::value; template <class Expected, template <class...> class Op, class... 
Args> inline constexpr bool is_detected_exact_v = - is_detected_exact<Expected, Op, Args...>::value; + is_detected_exact<Expected, Op, Args...>::value; // NOLINT template <class Expected, template <class...> class Op, class... Args> inline constexpr bool is_detected_convertible_v = diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp index ae1585a4989f2db19816587893a88218f10e2553..dd7ce5ce21f6b8de4849a00fc5ee459be1af71ca 100644 --- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -27,7 +27,11 @@ static_assert(false, #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_AnalyzePolicy.hpp> #include <Kokkos_Concepts.hpp> +#include <Kokkos_TypeInfo.hpp> +#ifndef KOKKOS_ENABLE_IMPL_TYPEINFO #include <typeinfo> +#endif +#include <limits> //---------------------------------------------------------------------------- @@ -39,7 +43,12 @@ struct ParallelReduceTag {}; struct ChunkSize { int value; + explicit ChunkSize(int value_) : value(value_) {} +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template <typename T = void> + KOKKOS_DEPRECATED_WITH_COMMENT("ChunkSize should be constructed explicitly.") ChunkSize(int value_) : value(value_) {} +#endif }; /** \brief Execution policy for work over a range of an integral type. 
@@ -114,62 +123,67 @@ class RangePolicy : public Impl::PolicyTraits<Properties...> { m_granularity_mask(0) {} /** \brief Total range */ + template <typename IndexType1, typename IndexType2, + std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> && + std::is_convertible_v<IndexType2, member_type>), + bool> = false> + inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end) + : RangePolicy(typename traits::execution_space(), work_begin, work_end) {} + + /** \brief Total range */ + template <typename IndexType1, typename IndexType2, + std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> && + std::is_convertible_v<IndexType2, member_type>), + bool> = false> inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end) + const IndexType1 work_begin, const IndexType2 work_end) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); set_auto_chunk_size(); } - /** \brief Total range */ - inline RangePolicy(const member_type work_begin, const member_type work_end) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - } - - /** \brief Total range */ - template <class... Args> - inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end, - Args... 
args) + template <typename IndexType1, typename IndexType2, + std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> && + std::is_convertible_v<IndexType2, member_type>), + bool> = false> + RangePolicy(const typename traits::execution_space& work_space, + const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { - set_auto_chunk_size(); - set(args...); + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); + set_chunk_size(chunk_size.value); } /** \brief Total range */ - template <class... Args> - inline RangePolicy(const member_type work_begin, const member_type work_end, - Args... args) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - set(args...); - } - - private: - inline void set() {} + template <typename IndexType1, typename IndexType2, typename... Args, + std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> && + std::is_convertible_v<IndexType2, member_type>), + bool> = false> + RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) + : RangePolicy(typename traits::execution_space(), work_begin, work_end, + chunk_size) {} public: - template <class... Args> - inline void set(Args...) { - static_assert( - 0 == sizeof...(Args), - "Kokkos::RangePolicy: unhandled constructor arguments encountered."); - } - - template <class... Args> - inline void set(const ChunkSize& chunksize, Args... 
args) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead") + inline void set(ChunkSize chunksize) { m_granularity = chunksize.value; m_granularity_mask = m_granularity - 1; - set(args...); } +#endif public: /** \brief return chunk_size */ @@ -186,8 +200,7 @@ class RangePolicy : public Impl::PolicyTraits<Properties...> { /** \brief finalize chunk_size if it was set to AUTO*/ inline void set_auto_chunk_size() { #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v<typename traits::execution_space, - Kokkos::Experimental::SYCL>) { + if (std::is_same_v<typename traits::execution_space, Kokkos::SYCL>) { // chunk_size <=1 lets the compiler choose the workgroup size when // launching kernels m_granularity = 1; @@ -218,6 +231,70 @@ class RangePolicy : public Impl::PolicyTraits<Properties...> { m_granularity_mask = m_granularity - 1; } + void check_bounds_validity() { + if (m_end < m_begin) { + std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" + + std::to_string(m_begin) + + ") is greater than the upper bound (" + + std::to_string(m_end) + ").\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + m_begin = 0; + m_end = 0; +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } + } + + // To be replaced with std::in_range (c++20) + template <typename IndexType> + static void check_conversion_safety([[maybe_unused]] const IndexType bound) { + // Checking that the round-trip conversion preserves input index value + if constexpr (std::is_convertible_v<member_type, IndexType>) { +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ + defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr 
(std::is_arithmetic_v<member_type> && + (std::is_signed_v<IndexType> != + std::is_signed_v<member_type>)) { + // check signed to unsigned + if constexpr (std::is_signed_v<IndexType>) + warn |= (bound < static_cast<IndexType>( + std::numeric_limits<member_type>::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v<member_type>) + warn |= (bound > static_cast<IndexType>( + std::numeric_limits<member_type>::max())); + } + + // check narrowing + warn |= + (static_cast<IndexType>(static_cast<member_type>(bound)) != bound); + + if (warn) { +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } +#endif + } + } + public: /** \brief Subrange for a partition's rank and size. * @@ -261,6 +338,21 @@ class RangePolicy : public Impl::PolicyTraits<Properties...> { }; }; +RangePolicy() -> RangePolicy<>; + +RangePolicy(int64_t, int64_t) -> RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&) -> RangePolicy<>; + +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t) -> RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) + -> RangePolicy<>; + +template <typename ES, typename = std::enable_if_t<is_execution_space_v<ES>>> +RangePolicy(ES const&, int64_t, int64_t) -> RangePolicy<ES>; + +template <typename ES, typename = std::enable_if_t<is_execution_space_v<ES>>> +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&) -> RangePolicy<ES>; + } // namespace Kokkos //---------------------------------------------------------------------------- @@ -428,24 +520,24 @@ struct PerThreadValue { template <class iType, class... Args> struct ExtractVectorLength { static inline iType value( - std::enable_if_t<std::is_integral<iType>::value, iType> val, Args...) { + std::enable_if_t<std::is_integral_v<iType>, iType> val, Args...) 
{ return val; } - static inline std::enable_if_t<!std::is_integral<iType>::value, int> value( - std::enable_if_t<!std::is_integral<iType>::value, iType>, Args...) { + static inline std::enable_if_t<!std::is_integral_v<iType>, int> value( + std::enable_if_t<!std::is_integral_v<iType>, iType>, Args...) { return 1; } }; template <class iType, class... Args> -inline std::enable_if_t<std::is_integral<iType>::value, iType> -extract_vector_length(iType val, Args...) { +inline std::enable_if_t<std::is_integral_v<iType>, iType> extract_vector_length( + iType val, Args...) { return val; } template <class iType, class... Args> -inline std::enable_if_t<!std::is_integral<iType>::value, int> -extract_vector_length(iType, Args...) { +inline std::enable_if_t<!std::is_integral_v<iType>, int> extract_vector_length( + iType, Args...) { return 1; } @@ -490,7 +582,7 @@ struct ScratchRequest { } }; -// Throws a runtime exception if level is not `0` or `1` +// Causes abnormal program termination if level is not `0` or `1` void team_policy_check_valid_storage_level_argument(int level); /** \brief Execution policy for parallel work over a league of teams of @@ -632,6 +724,57 @@ class TeamPolicy } }; +// Execution space not provided deduces to TeamPolicy<> + +TeamPolicy() -> TeamPolicy<>; + +TeamPolicy(int, int) -> TeamPolicy<>; +TeamPolicy(int, int, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&) -> TeamPolicy<>; + +// DefaultExecutionSpace deduces to TeamPolicy<> + +TeamPolicy(DefaultExecutionSpace const&, int, int) -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int) -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) + -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) + -> 
TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, + Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) + -> TeamPolicy<>; + +// ES != DefaultExecutionSpace deduces to TeamPolicy<ES> + +template <typename ES, + typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>> +TeamPolicy(ES const&, int, int) -> TeamPolicy<ES>; + +template <typename ES, + typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>> +TeamPolicy(ES const&, int, int, int) -> TeamPolicy<ES>; + +template <typename ES, + typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&) -> TeamPolicy<ES>; + +template <typename ES, + typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int) -> TeamPolicy<ES>; + +template <typename ES, + typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>> +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) + -> TeamPolicy<ES>; + +template <typename ES, + typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>> +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&) -> TeamPolicy<ES>; + namespace Impl { template <typename iType, class TeamMemberType> @@ -886,9 +1029,9 @@ struct TeamThreadMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> { static constexpr auto par_vector = Impl::TeamMDRangeParVector::NotParVector; static constexpr Iterate direction = - OuterDir == Iterate::Default - ? layout_iterate_type_selector<ArrayLayout>::outer_iteration_pattern - : iter; + OuterDir == Iterate::Default ? Impl::layout_iterate_type_selector< + ArrayLayout>::outer_iteration_pattern + : iter; template <class... Args> KOKKOS_FUNCTION TeamThreadMDRange(TeamHandleType const& team_, Args&&... args) @@ -901,8 +1044,8 @@ struct TeamThreadMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> { }; template <typename TeamHandle, typename... 
Args> -TeamThreadMDRange(TeamHandle const&, Args&&...) - ->TeamThreadMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>; +KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...) + -> TeamThreadMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>; template <typename Rank, typename TeamHandle> struct ThreadVectorMDRange; @@ -922,9 +1065,9 @@ struct ThreadVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> { static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector; static constexpr Iterate direction = - OuterDir == Iterate::Default - ? layout_iterate_type_selector<ArrayLayout>::outer_iteration_pattern - : iter; + OuterDir == Iterate::Default ? Impl::layout_iterate_type_selector< + ArrayLayout>::outer_iteration_pattern + : iter; template <class... Args> KOKKOS_INLINE_FUNCTION ThreadVectorMDRange(TeamHandleType const& team_, @@ -938,8 +1081,8 @@ struct ThreadVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> { }; template <typename TeamHandle, typename... Args> -ThreadVectorMDRange(TeamHandle const&, Args&&...) - ->ThreadVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>; +KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...) + -> ThreadVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>; template <typename Rank, typename TeamHandle> struct TeamVectorMDRange; @@ -959,9 +1102,9 @@ struct TeamVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> { static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector; static constexpr Iterate direction = - iter == Iterate::Default - ? layout_iterate_type_selector<ArrayLayout>::outer_iteration_pattern - : iter; + iter == Iterate::Default ? Impl::layout_iterate_type_selector< + ArrayLayout>::outer_iteration_pattern + : iter; template <class... 
Args> KOKKOS_INLINE_FUNCTION TeamVectorMDRange(TeamHandleType const& team_, @@ -975,15 +1118,24 @@ struct TeamVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> { }; template <typename TeamHandle, typename... Args> -TeamVectorMDRange(TeamHandle const&, Args&&...) - ->TeamVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>; +KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...) + -> TeamVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>; template <typename Rank, typename TeamHandle, typename Lambda, typename ReducerValueType> KOKKOS_INLINE_FUNCTION void parallel_reduce( TeamThreadMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/ + !std::is_array_v<ReducerValueType> && + !std::is_pointer_v<ReducerValueType> && + !Kokkos::is_reducer_v<ReducerValueType>, + "Only scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl<Rank>(policy, lambda, val); + policy.team.team_reduce( + Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{val}); } template <typename Rank, typename TeamHandle, typename Lambda> @@ -997,7 +1149,29 @@ template <typename Rank, typename TeamHandle, typename Lambda, KOKKOS_INLINE_FUNCTION void parallel_reduce( ThreadVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/ + !std::is_array_v<ReducerValueType> && + !std::is_pointer_v<ReducerValueType> && + !Kokkos::is_reducer_v<ReducerValueType>, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl<Rank>(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v<typename TeamHandle::execution_space, + Kokkos::Cuda> +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v<typename TeamHandle::execution_space, + Kokkos::HIP> +#elif 
defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v<typename TeamHandle::execution_space, + Kokkos::SYCL> +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{ + val}); } template <typename Rank, typename TeamHandle, typename Lambda> @@ -1011,7 +1185,31 @@ template <typename Rank, typename TeamHandle, typename Lambda, KOKKOS_INLINE_FUNCTION void parallel_reduce( TeamVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/ + !std::is_array_v<ReducerValueType> && + !std::is_pointer_v<ReducerValueType> && + !Kokkos::is_reducer_v<ReducerValueType>, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl<Rank>(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v<typename TeamHandle::execution_space, + Kokkos::Cuda> +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v<typename TeamHandle::execution_space, + Kokkos::HIP> +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v<typename TeamHandle::execution_space, + Kokkos::SYCL> +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{ + val}); + policy.team.team_reduce( + Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{val}); } template <typename Rank, typename TeamHandle, typename Lambda> @@ -1023,15 +1221,21 @@ KOKKOS_INLINE_FUNCTION void parallel_for( namespace Impl { template <typename FunctorType, typename TagType, - bool HasTag = !std::is_void<TagType>::value> + bool HasTag = !std::is_void_v<TagType>> struct ParallelConstructName; template <typename FunctorType, typename TagType> struct ParallelConstructName<FunctorType, TagType, true> { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = + 
std::string(TypeInfo<std::remove_const_t<FunctorType>>::name()) + + "/" + std::string(TypeInfo<TagType>::name()); +#else default_name = std::string(typeid(FunctorType).name()) + "/" + typeid(TagType).name(); +#endif } } std::string const& get() { @@ -1045,7 +1249,11 @@ template <typename FunctorType, typename TagType> struct ParallelConstructName<FunctorType, TagType, false> { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { - default_name = std::string(typeid(FunctorType).name()); +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = TypeInfo<std::remove_const_t<FunctorType>>::name(); +#else + default_name = typeid(FunctorType).name(); +#endif } } std::string const& get() { diff --git a/packages/kokkos/core/src/Kokkos_Extents.hpp b/packages/kokkos/core/src/Kokkos_Extents.hpp index 7968fab3aaa829b7f5618fe59ff8b2062b3d8710..7d1f8c755d780751978f63be936ffb7e0947b364 100644 --- a/packages/kokkos/core/src/Kokkos_Extents.hpp +++ b/packages/kokkos/core/src/Kokkos_Extents.hpp @@ -25,33 +25,40 @@ static_assert(false, #include <cstddef> #include <type_traits> #include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include <mdspan/mdspan.hpp> +#else +#include <limits> +#endif namespace Kokkos { -namespace Experimental { -constexpr ptrdiff_t dynamic_extent = -1; +#ifndef KOKKOS_ENABLE_IMPL_MDSPAN +constexpr size_t dynamic_extent = std::numeric_limits<size_t>::max(); +#endif -template <ptrdiff_t... ExtentSpecs> +namespace Experimental { + +template <size_t... ExtentSpecs> struct Extents { /* TODO @enhancement flesh this out more */ }; -template <class Exts, ptrdiff_t NewExtent> +template <class Exts, size_t NewExtent> struct PrependExtent; -template <ptrdiff_t... Exts, ptrdiff_t NewExtent> +template <size_t... 
Exts, size_t NewExtent> struct PrependExtent<Extents<Exts...>, NewExtent> { using type = Extents<NewExtent, Exts...>; }; -template <class Exts, ptrdiff_t NewExtent> +template <class Exts, size_t NewExtent> struct AppendExtent; -template <ptrdiff_t... Exts, ptrdiff_t NewExtent> +template <size_t... Exts, size_t NewExtent> struct AppendExtent<Extents<Exts...>, NewExtent> { using type = Extents<Exts..., NewExtent>; }; - } // end namespace Experimental namespace Impl { @@ -75,33 +82,32 @@ struct _parse_impl { // We have to treat the case of int**[x] specially, since it *doesn't* go // backwards -template <class T, ptrdiff_t... ExtentSpec> +template <class T, size_t... ExtentSpec> struct _parse_impl<T*, Kokkos::Experimental::Extents<ExtentSpec...>, std::enable_if_t<_all_remaining_extents_dynamic<T>::value>> - : _parse_impl<T, Kokkos::Experimental::Extents< - Kokkos::Experimental::dynamic_extent, ExtentSpec...>> { -}; + : _parse_impl<T, Kokkos::Experimental::Extents<Kokkos::dynamic_extent, + ExtentSpec...>> {}; // int*(*[x])[y] should still work also (meaning int[][x][][y]) -template <class T, ptrdiff_t... ExtentSpec> +template <class T, size_t... ExtentSpec> struct _parse_impl< T*, Kokkos::Experimental::Extents<ExtentSpec...>, std::enable_if_t<!_all_remaining_extents_dynamic<T>::value>> { using _next = Kokkos::Experimental::AppendExtent< typename _parse_impl<T, Kokkos::Experimental::Extents<ExtentSpec...>, void>::type, - Kokkos::Experimental::dynamic_extent>; + Kokkos::dynamic_extent>; using type = typename _next::type; }; -template <class T, ptrdiff_t... ExtentSpec, unsigned N> +template <class T, size_t... 
ExtentSpec, unsigned N> struct _parse_impl<T[N], Kokkos::Experimental::Extents<ExtentSpec...>, void> - : _parse_impl< - T, Kokkos::Experimental::Extents<ExtentSpec..., - ptrdiff_t(N)> // TODO @pedantic this - // could be a - // narrowing cast - > {}; + : _parse_impl<T, + Kokkos::Experimental::Extents<ExtentSpec..., + size_t(N)> // TODO @pedantic + // this could be a + // narrowing cast + > {}; } // end namespace _parse_view_extents_impl @@ -111,38 +117,34 @@ struct ParseViewExtents { DataType, Kokkos::Experimental::Extents<>>::type; }; -template <class ValueType, ptrdiff_t Ext> +template <class ValueType, size_t Ext> struct ApplyExtent { using type = ValueType[Ext]; }; template <class ValueType> -struct ApplyExtent<ValueType, Kokkos::Experimental::dynamic_extent> { +struct ApplyExtent<ValueType, Kokkos::dynamic_extent> { using type = ValueType*; }; -template <class ValueType, unsigned N, ptrdiff_t Ext> +template <class ValueType, unsigned N, size_t Ext> struct ApplyExtent<ValueType[N], Ext> { using type = typename ApplyExtent<ValueType, Ext>::type[N]; }; -template <class ValueType, ptrdiff_t Ext> +template <class ValueType, size_t Ext> struct ApplyExtent<ValueType*, Ext> { - using type = ValueType * [Ext]; + using type = ValueType* [Ext]; }; template <class ValueType> -struct ApplyExtent<ValueType*, Kokkos::Experimental::dynamic_extent> { - using type = - typename ApplyExtent<ValueType, - Kokkos::Experimental::dynamic_extent>::type*; +struct ApplyExtent<ValueType*, dynamic_extent> { + using type = typename ApplyExtent<ValueType, dynamic_extent>::type*; }; template <class ValueType, unsigned N> -struct ApplyExtent<ValueType[N], Kokkos::Experimental::dynamic_extent> { - using type = - typename ApplyExtent<ValueType, - Kokkos::Experimental::dynamic_extent>::type[N]; +struct ApplyExtent<ValueType[N], dynamic_extent> { + using type = typename ApplyExtent<ValueType, dynamic_extent>::type[N]; }; } // end namespace Impl diff --git 
a/packages/kokkos/core/src/Kokkos_Future.hpp b/packages/kokkos/core/src/Kokkos_Future.hpp index 0b3a153de8c4a450377415c05f221064b0feb55b..c26d08be1cffa7811f2d0f913981df98052df630 100644 --- a/packages/kokkos/core/src/Kokkos_Future.hpp +++ b/packages/kokkos/core/src/Kokkos_Future.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_FUTURE_HPP #define KOKKOS_FUTURE_HPP @@ -41,13 +47,19 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // For now, hack this in as a partial specialization // TODO @tasking @cleanup Make this the "normal" class template and make the old // code the specialization template <typename ValueType, typename ExecutionSpace, typename QueueType> -class BasicFuture<ValueType, SimpleTaskScheduler<ExecutionSpace, QueueType>> { +class KOKKOS_DEPRECATED + BasicFuture<ValueType, SimpleTaskScheduler<ExecutionSpace, QueueType>> { public: using value_type = ValueType; using execution_space = ExecutionSpace; @@ -244,7 +256,7 @@ class BasicFuture<ValueType, SimpleTaskScheduler<ExecutionSpace, QueueType>> { //////////////////////////////////////////////////////////////////////////////// template <typename ValueType, typename Scheduler> -class BasicFuture { +class KOKKOS_DEPRECATED BasicFuture { private: template <typename, typename> friend class BasicTaskScheduler; @@ -413,13 +425,13 @@ class BasicFuture { // Is a Future with the given execution space template <typename, 
typename ExecSpace = void> -struct is_future : public std::false_type {}; +struct KOKKOS_DEPRECATED is_future : public std::false_type {}; template <typename ValueType, typename Scheduler, typename ExecSpace> -struct is_future<BasicFuture<ValueType, Scheduler>, ExecSpace> +struct KOKKOS_DEPRECATED is_future<BasicFuture<ValueType, Scheduler>, ExecSpace> : std::bool_constant< - std::is_same<ExecSpace, typename Scheduler::execution_space>::value || - std::is_void<ExecSpace>::value> {}; + std::is_same_v<ExecSpace, typename Scheduler::execution_space> || + std::is_void_v<ExecSpace>> {}; //////////////////////////////////////////////////////////////////////////////// // END OLD CODE @@ -432,8 +444,8 @@ class ResolveFutureArgOrder { private: enum { Arg1_is_space = Kokkos::is_space<Arg1>::value }; enum { Arg2_is_space = Kokkos::is_space<Arg2>::value }; - enum { Arg1_is_value = !Arg1_is_space && !std::is_void<Arg1>::value }; - enum { Arg2_is_value = !Arg2_is_space && !std::is_void<Arg2>::value }; + enum { Arg1_is_value = !Arg1_is_space && !std::is_void_v<Arg1> }; + enum { Arg2_is_value = !Arg2_is_space && !std::is_void_v<Arg2> }; static_assert(!(Arg1_is_space && Arg2_is_space), "Future cannot be given two spaces"); @@ -463,10 +475,15 @@ class ResolveFutureArgOrder { * */ template <class Arg1 = void, class Arg2 = void> -using Future = typename Impl::ResolveFutureArgOrder<Arg1, Arg2>::type; +using Future KOKKOS_DEPRECATED = + typename Impl::ResolveFutureArgOrder<Arg1, Arg2>::type; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_Graph.hpp b/packages/kokkos/core/src/Kokkos_Graph.hpp index 643bdcc02ccc904c80ae3fe0b7e1712de18af819..05d774ac61a2eeb7fbf28bd13c5a3c8a38622f5f 100644 --- 
a/packages/kokkos/core/src/Kokkos_Graph.hpp +++ b/packages/kokkos/core/src/Kokkos_Graph.hpp @@ -86,10 +86,21 @@ struct [[nodiscard]] Graph { return m_impl_ptr->get_execution_space(); } - void submit() const { + void instantiate() { KOKKOS_EXPECTS(bool(m_impl_ptr)) - (*m_impl_ptr).submit(); + (*m_impl_ptr).instantiate(); } + + void submit(const execution_space& exec) const { + KOKKOS_EXPECTS(bool(m_impl_ptr)) + (*m_impl_ptr).submit(exec); + } + + void submit() const { submit(get_execution_space()); } + + decltype(auto) native_graph(); + + decltype(auto) native_graph_exec(); }; // </editor-fold> end Graph }}}1 @@ -135,22 +146,68 @@ Graph<ExecutionSpace> create_graph(ExecutionSpace ex, Closure&& arg_closure) { // function template injection works. auto rv = Kokkos::Impl::GraphAccess::construct_graph(std::move(ex)); // Invoke the user's graph construction closure - ((Closure &&) arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); + ((Closure&&)arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); // and given them back the graph // KOKKOS_ENSURES(rv.m_impl_ptr.use_count() == 1) return rv; } +template <class ExecutionSpace = DefaultExecutionSpace> +std::enable_if_t<Kokkos::is_execution_space_v<ExecutionSpace>, + Graph<ExecutionSpace>> +create_graph(ExecutionSpace exec = ExecutionSpace{}) { + return Kokkos::Impl::GraphAccess::construct_graph(std::move(exec)); +} + template < class ExecutionSpace = DefaultExecutionSpace, class Closure = Kokkos::Impl::DoNotExplicitlySpecifyThisTemplateParameter> -Graph<ExecutionSpace> create_graph(Closure&& arg_closure) { - return create_graph(ExecutionSpace{}, (Closure &&) arg_closure); +std::enable_if_t< + !Kokkos::is_execution_space_v<Kokkos::Impl::remove_cvref_t<Closure>>, + Graph<ExecutionSpace>> +create_graph(Closure&& arg_closure) { + return create_graph(ExecutionSpace{}, (Closure&&)arg_closure); } // </editor-fold> end create_graph }}}1 
//============================================================================== +template <class ExecutionSpace> +decltype(auto) Graph<ExecutionSpace>::native_graph() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v<ExecutionSpace, Kokkos::Cuda>) { + return m_impl_ptr->cuda_graph(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v<ExecutionSpace, Kokkos::HIP>) { + return m_impl_ptr->hip_graph(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v<ExecutionSpace, Kokkos::SYCL>) { + return m_impl_ptr->sycl_graph(); + } +#endif +} + +template <class ExecutionSpace> +decltype(auto) Graph<ExecutionSpace>::native_graph_exec() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v<ExecutionSpace, Kokkos::Cuda>) { + return m_impl_ptr->cuda_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v<ExecutionSpace, Kokkos::HIP>) { + return m_impl_ptr->hip_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v<ExecutionSpace, Kokkos::SYCL>) { + return m_impl_ptr->sycl_graph_exec(); + } +#endif +} + } // end namespace Experimental } // namespace Kokkos @@ -163,10 +220,13 @@ Graph<ExecutionSpace> create_graph(Closure&& arg_closure) { #include <Cuda/Kokkos_Cuda_Graph_Impl.hpp> #if defined(KOKKOS_ENABLE_HIP) // The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it. 
-#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) +#if defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) #include <HIP/Kokkos_HIP_Graph_Impl.hpp> #endif #endif +#ifdef SYCL_EXT_ONEAPI_GRAPH +#include <SYCL/Kokkos_SYCL_Graph_Impl.hpp> +#endif #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH diff --git a/packages/kokkos/core/src/Kokkos_GraphNode.hpp b/packages/kokkos/core/src/Kokkos_GraphNode.hpp index 2a4e2cf6414a55f8283641dcc10ca8a71f16e2dd..a0a60c07d094ba666e7efd1d8b96d68ff6ae1492 100644 --- a/packages/kokkos/core/src/Kokkos_GraphNode.hpp +++ b/packages/kokkos/core/src/Kokkos_GraphNode.hpp @@ -48,7 +48,7 @@ class GraphNodeRef { // intended to be SFINAE-safe, so do validation before you instantiate. static_assert( - std::is_same<Predecessor, TypeErasedTag>::value || + std::is_same_v<Predecessor, TypeErasedTag> || Kokkos::Impl::is_specialization_of<Predecessor, GraphNodeRef>::value, "Invalid predecessor template parameter given to GraphNodeRef"); @@ -56,7 +56,7 @@ class GraphNodeRef { Kokkos::is_execution_space<ExecutionSpace>::value, "Invalid execution space template parameter given to GraphNodeRef"); - static_assert(std::is_same<Predecessor, TypeErasedTag>::value || + static_assert(std::is_same_v<Predecessor, TypeErasedTag> || Kokkos::Impl::is_graph_kernel<Kernel>::value, "Invalid kernel template parameter given to GraphNodeRef"); @@ -151,7 +151,7 @@ class GraphNodeRef { typename return_t::node_impl_t>( m_node_impl->execution_space_instance(), Kokkos::Impl::_graph_node_kernel_ctor_tag{}, - (NextKernelDeduced &&) arg_kernel, + (NextKernelDeduced&&)arg_kernel, // *this is the predecessor Kokkos::Impl::_graph_node_predecessor_ctor_tag{}, *this)); @@ -184,10 +184,10 @@ class GraphNodeRef { // <editor-fold desc="rule of 6 ctors"> {{{3 // Copyable and movable (basically just shared_ptr semantics - GraphNodeRef() noexcept = default; - GraphNodeRef(GraphNodeRef const&) = default; - 
GraphNodeRef(GraphNodeRef&&) noexcept = default; - GraphNodeRef& operator=(GraphNodeRef const&) = default; + GraphNodeRef() noexcept = default; + GraphNodeRef(GraphNodeRef const&) = default; + GraphNodeRef(GraphNodeRef&&) noexcept = default; + GraphNodeRef& operator=(GraphNodeRef const&) = default; GraphNodeRef& operator=(GraphNodeRef&&) noexcept = default; ~GraphNodeRef() = default; @@ -197,19 +197,19 @@ class GraphNodeRef { //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // <editor-fold desc="Type-erasing converting ctor and assignment"> {{{3 - template < - class OtherKernel, class OtherPredecessor, - std::enable_if_t< - // Not a copy/move constructor - !std::is_same<GraphNodeRef, GraphNodeRef<execution_space, OtherKernel, - OtherPredecessor>>::value && - // must be an allowed type erasure of the kernel - Kokkos::Impl::is_compatible_type_erasure<OtherKernel, - graph_kernel>::value && - // must be an allowed type erasure of the predecessor - Kokkos::Impl::is_compatible_type_erasure< - OtherPredecessor, graph_predecessor>::value, - int> = 0> + template <class OtherKernel, class OtherPredecessor, + std::enable_if_t< + // Not a copy/move constructor + !std::is_same_v<GraphNodeRef, + GraphNodeRef<execution_space, OtherKernel, + OtherPredecessor>> && + // must be an allowed type erasure of the kernel + Kokkos::Impl::is_compatible_type_erasure< + OtherKernel, graph_kernel>::value && + // must be an allowed type erasure of the predecessor + Kokkos::Impl::is_compatible_type_erasure< + OtherPredecessor, graph_predecessor>::value, + int> = 0> /* implicit */ GraphNodeRef( GraphNodeRef<execution_space, OtherKernel, OtherPredecessor> const& other) @@ -257,7 +257,7 @@ class GraphNodeRef { //|| policy_t::execution_space_is_defaulted, "Execution Space mismatch between execution policy and graph"); - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, 
Kokkos::Impl::KernelInGraphProperty{}); using next_policy_t = decltype(policy); @@ -266,8 +266,8 @@ class GraphNodeRef { std::decay_t<Functor>, Kokkos::ParallelForTag>; return this->_then_kernel(next_kernel_t{std::move(arg_name), policy.space(), - (Functor &&) functor, - (Policy &&) policy}); + (Functor&&)functor, + (Policy&&)policy}); } template < @@ -280,8 +280,7 @@ class GraphNodeRef { int> = 0> auto then_parallel_for(Policy&& policy, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor<Functor> - return this->then_parallel_for("", (Policy &&) policy, - (Functor &&) functor); + return this->then_parallel_for("", (Policy&&)policy, (Functor&&)functor); } template <class Functor> @@ -290,13 +289,13 @@ class GraphNodeRef { // needs to static assert constraint: DataParallelFunctor<Functor> return this->then_parallel_for(std::move(name), Kokkos::RangePolicy<execution_space>(0, n), - (Functor &&) functor); + (Functor&&)functor); } template <class Functor> auto then_parallel_for(std::size_t n, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor<Functor> - return this->then_parallel_for("", n, (Functor &&) functor); + return this->then_parallel_for("", n, (Functor&&)functor); } // </editor-fold> end then_parallel_for }}}2 @@ -359,6 +358,23 @@ class GraphNodeRef { Kokkos::is_reducer<return_type_remove_cvref>::value, "Output argument to parallel reduce in a graph must be a " "View or a Reducer"); + + if constexpr (Kokkos::is_reducer_v<return_type_remove_cvref>) { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename return_type_remove_cvref:: + result_view_type::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } else { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, + typename return_type_remove_cvref::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } 
+ using return_type = // Yes, you do really have to do this... std::conditional_t<Kokkos::is_reducer<return_type_remove_cvref>::value, @@ -373,7 +389,7 @@ class GraphNodeRef { // End of Kokkos reducer disaster //---------------------------------------- - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using passed_reducer_type = typename return_value_adapter::reducer_type; @@ -399,7 +415,7 @@ class GraphNodeRef { return this->_then_kernel(next_kernel_t{ std::move(arg_name), graph_impl_ptr->get_execution_space(), - functor_reducer, (Policy &&) policy, + functor_reducer, (Policy&&)policy, return_value_adapter::return_value(return_value, functor)}); } @@ -413,9 +429,9 @@ class GraphNodeRef { int> = 0> auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", (Policy &&) arg_policy, - (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", (Policy&&)arg_policy, + (Functor&&)functor, + (ReturnType&&)return_value); } template <class Functor, class ReturnType> @@ -425,15 +441,15 @@ class GraphNodeRef { ReturnType&& return_value) const { return this->then_parallel_reduce( std::move(label), Kokkos::RangePolicy<execution_space>{0, idx_end}, - (Functor &&) functor, (ReturnType &&) return_value); + (Functor&&)functor, (ReturnType&&)return_value); } template <class Functor, class ReturnType> auto then_parallel_reduce(typename execution_space::size_type idx_end, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", idx_end, (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", idx_end, (Functor&&)functor, + (ReturnType&&)return_value); } // </editor-fold> end then_parallel_reduce }}}2 diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp 
b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp deleted file mode 100644 index 369b7bafb7b8f54253625b0290502e45f42ded58..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp +++ /dev/null @@ -1,308 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include <Kokkos_Macros.hpp> -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_HBWSPACE_HPP -#define KOKKOS_HBWSPACE_HPP - -#include <Kokkos_Macros.hpp> -#ifdef KOKKOS_ENABLE_HBWSPACE - -#include <Kokkos_HostSpace.hpp> - -namespace Kokkos { - -namespace Experimental { - -/// \class HBWSpace -/// \brief Memory management for host memory. -/// -/// HBWSpace is a memory space that governs host memory. "Host" -/// memory means the usual CPU-accessible memory. -class HBWSpace { - public: - //! Tag this class as a kokkos memory space - using memory_space = HBWSpace; - using size_type = size_t; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - using execution_space = Kokkos::DefaultHostExecutionSpace; - - //! 
This memory space preferred device_type - using device_type = Kokkos::Device<execution_space, memory_space>; - - /**\brief Default memory space instance */ - HBWSpace(); - HBWSpace(const HBWSpace& rhs) = default; - HBWSpace& operator=(const HBWSpace&) = default; - ~HBWSpace() = default; - - /**\brief Non-default memory space instance to choose allocation mechansim, - * if available */ - - enum AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; - - explicit HBWSpace(const AllocationMechanism&); - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const; - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - private: - template <class, class, class, class> - friend class LogicalMemorySpace; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - - public: - /**\brief Return Name of the MemorySpace */ - static constexpr const char* name() { return "HBW"; } - - private: - AllocationMechanism m_alloc_mech; - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::HBWSpace, void>; -}; - -} // namespace Experimental - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl 
{ - -template <> -class SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void> - : public SharedAllocationRecord<void, void> { - private: - friend Kokkos::Experimental::HBWSpace; - - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase*); - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HBWSpace instance */ - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::HBWSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - static void print_records(std::ostream&, - const 
Kokkos::Experimental::HBWSpace&, - bool detail = false); -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -static_assert( - Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HBWSpace, - Kokkos::Experimental::HBWSpace>::assignable, - ""); - -template <> -struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::Experimental::HBWSpace> { - enum : bool { assignable = true }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -template <> -struct MemorySpaceAccess<Kokkos::Experimental::HBWSpace, Kokkos::HostSpace> { - enum : bool { assignable = false }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -struct DeepCopy<Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace, - DefaultHostExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template <class ExecutionSpace> -struct DeepCopy<Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace, - ExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, " - "Kokkos::Experimental::HBWSpace,ExecutionSpace::DeepCopy: fence " - "before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); - } -}; - -template <> -struct DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, - DefaultHostExecutionSpace> { - DeepCopy(void* dst, 
const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template <class ExecutionSpace> -struct DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, ExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, " - "ExecutionSpace>::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(copy_space, dst, src, n); - } -}; - -template <> -struct DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, - DefaultHostExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template <class ExecutionSpace> -struct DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, ExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, " - "ExecutionSpace>::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); - } -}; - -} // namespace Impl - -} // namespace Kokkos - -#endif -#endif // #define KOKKOS_HBWSPACE_HPP diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp index 90d140406374662bd005bf8959b61d7232e15d1a..706586826f48c1d81aa6455812185eeacbb51cd8 100644 --- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp @@ -37,7 +37,6 @@ 
static_assert(false, #include <impl/Kokkos_Tools.hpp> #include "impl/Kokkos_HostSpace_deepcopy.hpp" -#include <impl/Kokkos_MemorySpace.hpp> /*--------------------------------------------------------------------------*/ @@ -64,10 +63,10 @@ class HostSpace { //! This memory space preferred device_type using device_type = Kokkos::Device<execution_space, memory_space>; - HostSpace() = default; - HostSpace(HostSpace&& rhs) = default; - HostSpace(const HostSpace& rhs) = default; - HostSpace& operator=(HostSpace&&) = default; + HostSpace() = default; + HostSpace(HostSpace&& rhs) = default; + HostSpace(const HostSpace& rhs) = default; + HostSpace& operator=(HostSpace&&) = default; HostSpace& operator=(const HostSpace&) = default; ~HostSpace() = default; @@ -75,18 +74,35 @@ class HostSpace { /**\brief Non-default memory space instance to choose allocation mechansim, * if available */ - enum KOKKOS_DEPRECATED AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; +#if defined(KOKKOS_COMPILER_GNU) && KOKKOS_COMPILER_GNU < 1100 + // We see deprecation warnings even when not using the deprecated + // HostSpace constructor below when using gcc before release 11. 
+ enum +#else + enum KOKKOS_DEPRECATED +#endif + AllocationMechanism { + STD_MALLOC, + POSIX_MEMALIGN, + POSIX_MMAP, + INTEL_MM_ALLOC + }; KOKKOS_DEPRECATED explicit HostSpace(const AllocationMechanism&); #endif /**\brief Allocate untracked memory in the space */ + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -97,10 +113,6 @@ class HostSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template <class, class, class, class> - friend class Kokkos::Experimental::LogicalMemorySpace; - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -111,13 +123,11 @@ class HostSpace { const Kokkos::Tools::SpaceHandle = Kokkos::Tools::make_space_handle(name())) const; - public: /**\brief Return Name of the MemorySpace */ static constexpr const char* name() { return m_name; } private: static constexpr const char* m_name = "Host"; - friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; }; } // namespace Kokkos @@ -129,8 +139,7 @@ namespace Kokkos { namespace Impl { static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HostSpace>::assignable, - ""); + Kokkos::HostSpace>::assignable); template <typename S> struct HostMirror { @@ -166,75 +175,7 @@ struct HostMirror { //---------------------------------------------------------------------------- -namespace Kokkos { - -namespace Impl { - -template <> -class 
SharedAllocationRecord<Kokkos::HostSpace, void> - : public SharedAllocationRecordCommon<Kokkos::HostSpace> { - private: - friend Kokkos::HostSpace; - friend class SharedAllocationRecordCommon<Kokkos::HostSpace>; - - using base_t = SharedAllocationRecordCommon<Kokkos::HostSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HostSpace instance */ - static RecordBase s_root_record; -#endif - - Kokkos::HostSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
- template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /* exec_space*/, const Kokkos::HostSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::HostSpace, void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } -}; - -} // namespace Impl - -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HostSpace); //---------------------------------------------------------------------------- @@ -242,18 +183,6 @@ namespace Kokkos { namespace Impl { -template <> -struct DeepCopy<HostSpace, HostSpace, DefaultHostExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy_async(exec, dst, src, n); - } -}; - template <class ExecutionSpace> struct DeepCopy<HostSpace, HostSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { @@ -261,10 +190,15 @@ struct 
DeepCopy<HostSpace, HostSpace, ExecutionSpace> { } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy<HostSpace, HostSpace, " - "ExecutionSpace>::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); + if constexpr (!Kokkos::SpaceAccessibility<ExecutionSpace, + Kokkos::HostSpace>::accessible) { + exec.fence( + "Kokkos::Impl::DeepCopy<HostSpace, HostSpace, " + "ExecutionSpace>::DeepCopy: fence before copy"); + hostspace_parallel_deepcopy_async(dst, src, n); + } else { + hostspace_parallel_deepcopy_async(exec, dst, src, n); + } } }; diff --git a/packages/kokkos/core/src/Kokkos_Layout.hpp b/packages/kokkos/core/src/Kokkos_Layout.hpp index ca4d956784c914ac049d64576996e7f69fbd0e6b..a760e7054a1690e44ce33a6240c5b770e470027a 100644 --- a/packages/kokkos/core/src/Kokkos_Layout.hpp +++ b/packages/kokkos/core/src/Kokkos_Layout.hpp @@ -52,13 +52,17 @@ struct LayoutLeft { using array_layout = LayoutLeft; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutLeft(LayoutLeft const&) = default; - LayoutLeft(LayoutLeft&&) = default; + LayoutLeft(LayoutLeft const&) = default; + LayoutLeft(LayoutLeft&&) = default; LayoutLeft& operator=(LayoutLeft const&) = default; - LayoutLeft& operator=(LayoutLeft&&) = default; + LayoutLeft& operator=(LayoutLeft&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutLeft(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -69,7 +73,8 @@ struct LayoutLeft { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + 
stride(KOKKOS_IMPL_CTOR_DEFAULT_ARG) {} friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -101,13 +106,17 @@ struct LayoutRight { using array_layout = LayoutRight; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutRight(LayoutRight const&) = default; - LayoutRight(LayoutRight&&) = default; + LayoutRight(LayoutRight const&) = default; + LayoutRight(LayoutRight&&) = default; LayoutRight& operator=(LayoutRight const&) = default; - LayoutRight& operator=(LayoutRight&&) = default; + LayoutRight& operator=(LayoutRight&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutRight(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -118,7 +127,8 @@ struct LayoutRight { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{KOKKOS_IMPL_CTOR_DEFAULT_ARG} {} friend bool operator==(const LayoutRight& left, const LayoutRight& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -144,10 +154,10 @@ struct LayoutStride { enum : bool { is_extent_constructible = false }; - LayoutStride(LayoutStride const&) = default; - LayoutStride(LayoutStride&&) = default; + LayoutStride(LayoutStride const&) = default; + LayoutStride(LayoutStride&&) = default; LayoutStride& operator=(LayoutStride const&) = default; - LayoutStride& operator=(LayoutStride&&) = default; + LayoutStride& operator=(LayoutStride&&) = default; /** \brief Compute strides from ordered dimensions. 
* @@ -191,8 +201,8 @@ struct LayoutStride { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S5 = 0, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S6 = 0, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S7 = 0) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3, - S4, S5, S6, S7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{S0, S1, S2, S3, S4, S5, S6, S7} {} friend bool operator==(const LayoutStride& left, const LayoutStride& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -217,81 +227,12 @@ enum class Iterate { Right // Right indices stride fastest }; -// To check for LayoutTiled -// This is to hide extra compile-time 'identifier' info within the LayoutTiled -// class by not relying on template specialization to include the ArgN*'s -template <typename LayoutTiledCheck, class Enable = void> -struct is_layouttiled : std::false_type {}; - -template <typename LayoutTiledCheck> -struct is_layouttiled<LayoutTiledCheck, - std::enable_if_t<LayoutTiledCheck::is_array_layout_tiled>> - : std::true_type {}; - -namespace Experimental { - -/// LayoutTiled -// Must have Rank >= 2 -template < - Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2 = 0, unsigned ArgN3 = 0, unsigned ArgN4 = 0, - unsigned ArgN5 = 0, unsigned ArgN6 = 0, unsigned ArgN7 = 0, - bool IsPowerOfTwo = - (Kokkos::Impl::is_integral_power_of_two(ArgN0) && - Kokkos::Impl::is_integral_power_of_two(ArgN1) && - (Kokkos::Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0)) && - (Kokkos::Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0)))> -struct LayoutTiled { - static_assert(IsPowerOfTwo, - "LayoutTiled must be given power-of-two 
tile dimensions"); - - using array_layout = LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, - ArgN4, ArgN5, ArgN6, ArgN7, IsPowerOfTwo>; - static constexpr Iterate outer_pattern = OuterP; - static constexpr Iterate inner_pattern = InnerP; - - enum { N0 = ArgN0 }; - enum { N1 = ArgN1 }; - enum { N2 = ArgN2 }; - enum { N3 = ArgN3 }; - enum { N4 = ArgN4 }; - enum { N5 = ArgN5 }; - enum { N6 = ArgN6 }; - enum { N7 = ArgN7 }; - - size_t dimension[ARRAY_LAYOUT_MAX_RANK]; - - enum : bool { is_extent_constructible = true }; - - LayoutTiled(LayoutTiled const&) = default; - LayoutTiled(LayoutTiled&&) = default; - LayoutTiled& operator=(LayoutTiled const&) = default; - LayoutTiled& operator=(LayoutTiled&&) = default; - - KOKKOS_INLINE_FUNCTION - explicit constexpr LayoutTiled(size_t argN0 = 0, size_t argN1 = 0, - size_t argN2 = 0, size_t argN3 = 0, - size_t argN4 = 0, size_t argN5 = 0, - size_t argN6 = 0, size_t argN7 = 0) - : dimension{argN0, argN1, argN2, argN3, argN4, argN5, argN6, argN7} {} - - friend bool operator==(const LayoutTiled& left, const LayoutTiled& right) { - for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) - if (left.dimension[rank] != right.dimension[rank]) return false; - return true; - } - - friend bool operator!=(const LayoutTiled& left, const LayoutTiled& right) { - return !(left == right); - } -}; - -} // namespace Experimental +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template <typename Layout, class Enable = void> +struct KOKKOS_DEPRECATED is_layouttiled : std::false_type {}; +#endif +namespace Impl { // For use with view_copy template <typename... 
Layout> struct layout_iterate_type_selector { @@ -320,42 +261,13 @@ struct layout_iterate_type_selector<Kokkos::LayoutStride> { static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Default; }; +} // namespace Impl -template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, - unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> -struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled< - Kokkos::Iterate::Left, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, - ArgN4, ArgN5, ArgN6, ArgN7, true>> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; -}; - -template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, - unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> -struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled< - Kokkos::Iterate::Right, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, - ArgN4, ArgN5, ArgN6, ArgN7, true>> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; -}; - -template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, - unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> -struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled< - Kokkos::Iterate::Left, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, - ArgN4, ArgN5, ArgN6, ArgN7, true>> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; -}; - -template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, - unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> -struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled< - Kokkos::Iterate::Right, Kokkos::Iterate::Right, ArgN0, ArgN1, 
ArgN2, ArgN3, - ArgN4, ArgN5, ArgN6, ArgN7, true>> { - static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; - static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; -}; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template <typename... Layout> +using layout_iterate_type_selector KOKKOS_DEPRECATED = + Impl::layout_iterate_type_selector<Layout...>; +#endif } // namespace Kokkos diff --git a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp b/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp deleted file mode 100644 index 1ee1d2c81fe58a04308fb488afe3f8f19fd7da89..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp +++ /dev/null @@ -1,413 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include <Kokkos_Macros.hpp> -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_LOGICALSPACES_HPP -#define KOKKOS_LOGICALSPACES_HPP - -#include <Kokkos_Macros.hpp> -#include <Kokkos_Core_fwd.hpp> -#include <Kokkos_ScratchSpace.hpp> -#include <impl/Kokkos_MemorySpace.hpp> -#include <impl/Kokkos_Error.hpp> -#include <impl/Kokkos_SharedAlloc.hpp> -#include <impl/Kokkos_Profiling.hpp> -#include <cstring> -namespace Kokkos { -namespace Experimental { -struct DefaultMemorySpaceNamer { - static constexpr const char* get_name() { - return "DefaultLogicalMemorySpaceName"; - } -}; - -struct LogicalSpaceSharesAccess { - struct shared_access {}; - struct no_shared_access {}; -}; - -/// \class LogicalMemorySpace -/// \brief -/// -/// LogicalMemorySpace is a space that is identical to another space, -/// but differentiable by name and template argument -template <class BaseSpace, class DefaultBaseExecutionSpace = void, - class Namer = DefaultMemorySpaceNamer, - class SharesAccessWithBase = LogicalSpaceSharesAccess::shared_access> -class LogicalMemorySpace { -#ifdef KOKKOS_ENABLE_OPENMPTARGET - // [DZP] For some reason I don't yet know, using LogicalMemorySpaces - // inside an OpenMPTarget build causes errors in the - // SharedAllocationRecords of other types. This is my way of erroring - // a build if we instantiate a LogicalMemSpace in an OMPTarget build - static_assert(!std::is_same<BaseSpace, BaseSpace>::value, - "Can't use LogicalMemorySpaces in an OpenMPTarget build, we're " - "debugging memory issues"); -#endif - public: - //! 
Tag this class as a kokkos memory space - using memory_space = LogicalMemorySpace<BaseSpace, DefaultBaseExecutionSpace, - Namer, SharesAccessWithBase>; - using size_type = typename BaseSpace::size_type; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - - using execution_space = - std::conditional_t<std::is_void<DefaultBaseExecutionSpace>::value, - typename BaseSpace::execution_space, - DefaultBaseExecutionSpace>; - - using device_type = Kokkos::Device<execution_space, memory_space>; - - LogicalMemorySpace() = default; - - template <typename... Args> - LogicalMemorySpace(Args&&... args) : underlying_space((Args &&) args...) {} - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); - } - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); - } - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); - } - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); - } - - /**\brief Return Name of the MemorySpace */ - constexpr static const char* name() { return Namer::get_name(); } - - private: - BaseSpace underlying_space; - template <class, class, class, class> - friend class LogicalMemorySpace; - friend class Kokkos::Impl::SharedAllocationRecord<memory_space, void>; - - void* impl_allocate(const 
char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - return underlying_space.impl_allocate(arg_label, arg_alloc_size, - arg_logical_size, arg_handle); - } - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - underlying_space.impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, - arg_logical_size, arg_handle); - } -}; -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <typename BaseSpace, typename DefaultBaseExecutionSpace, class Namer, - typename OtherSpace> -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - OtherSpace> { - enum { assignable = MemorySpaceAccess<BaseSpace, OtherSpace>::assignable }; - enum { accessible = MemorySpaceAccess<BaseSpace, OtherSpace>::accessible }; - enum { deepcopy = MemorySpaceAccess<BaseSpace, OtherSpace>::deepcopy }; -}; - -template <typename BaseSpace, typename DefaultBaseExecutionSpace, class Namer, - typename OtherSpace> -struct MemorySpaceAccess< - OtherSpace, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = MemorySpaceAccess<OtherSpace, BaseSpace>::assignable }; - enum { accessible = MemorySpaceAccess<OtherSpace, BaseSpace>::accessible }; - enum { deepcopy = MemorySpaceAccess<OtherSpace, BaseSpace>::deepcopy }; -}; - -template <typename BaseSpace, typename DefaultBaseExecutionSpace, class Namer> -struct 
MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { -template <class BaseSpace, class DefaultBaseExecutionSpace, class Namer, - class SharesAccessSemanticsWithBase> -class SharedAllocationRecord<Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - SharesAccessSemanticsWithBase>, - void> : public SharedAllocationRecord<void, void> { - private: - using SpaceType = - Kokkos::Experimental::LogicalMemorySpace<BaseSpace, - DefaultBaseExecutionSpace, Namer, - SharesAccessSemanticsWithBase>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase* arg_rec) { - delete static_cast<SharedAllocationRecord*>(arg_rec); - } - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this - * LogicalMemorySpace instance */ - static RecordBase s_root_record; -#endif - - const SpaceType m_space; - - protected: - ~SharedAllocationRecord() { - m_space.deallocate(RecordBase::m_alloc_ptr->m_label, - SharedAllocationRecord<void, void>::m_alloc_ptr, - SharedAllocationRecord<void, void>::m_alloc_size, - (SharedAllocationRecord<void, void>::m_alloc_size - - sizeof(SharedAllocationHeader))); - } - SharedAllocationRecord() = default; - - template <typename ExecutionSpace> - SharedAllocationRecord( - const 
ExecutionSpace& /*exec_space*/, const SpaceType& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord<void, void>( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<SpaceType, void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast<SharedAllocationRecord<void, void>*>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; - } - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const SpaceType& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return (void*)nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); - 
} - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy<SpaceType, SpaceType>( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord<Kokkos::Experimental::LogicalMemorySpace, " - "void>::reallocate_tracked: fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); - } - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } - } - - static SharedAllocationRecord* get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = SharedAllocationRecord<SpaceType, void>; - - SharedAllocationHeader const* const head = - alloc_ptr ? Header::get_header(alloc_ptr) - : (SharedAllocationHeader*)nullptr; - RecordHost* const record = - head ? 
static_cast<RecordHost*>(head->m_record) : (RecordHost*)nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< LogicalMemorySpace<> , " - "void >::get_record ERROR")); - } - - return record; - } -#ifdef KOKKOS_ENABLE_DEBUG - static void print_records(std::ostream& s, const SpaceType&, - bool detail = false) { - SharedAllocationRecord<void, void>::print_host_accessible_records( - s, "HostSpace", &s_root_record, detail); - } -#else - static void print_records(std::ostream&, const SpaceType&, - bool detail = false) { - (void)detail; - throw_runtime_exception( - "SharedAllocationRecord<HostSpace>::print_records only works " - "with KOKKOS_ENABLE_DEBUG enabled"); - } -#endif -}; -#ifdef KOKKOS_ENABLE_DEBUG -/**\brief Root record for tracked allocations from this LogicalSpace - * instance */ -template <class BaseSpace, class DefaultBaseExecutionSpace, class Namer, - class SharesAccessSemanticsWithBase> -SharedAllocationRecord<void, void> - SharedAllocationRecord<Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - SharesAccessSemanticsWithBase>, - void>::s_root_record; -#endif - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <class Namer, class BaseSpace, class DefaultBaseExecutionSpace, - class SharesAccess, class ExecutionSpace> -struct DeepCopy<Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy<BaseSpace, BaseSpace, ExecutionSpace>(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy<BaseSpace, BaseSpace, 
ExecutionSpace>(exec, dst, src, n); - } -}; - -template <class Namer, class BaseSpace, class DefaultBaseExecutionSpace, - class SharesAccess, class ExecutionSpace, class SourceSpace> -struct DeepCopy<SourceSpace, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy<SourceSpace, BaseSpace, ExecutionSpace>(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy<SourceSpace, BaseSpace, ExecutionSpace>(exec, dst, src, n); - } -}; - -template <class Namer, class BaseSpace, class DefaultBaseExecutionSpace, - class SharesAccess, class ExecutionSpace, class DestinationSpace> -struct DeepCopy<Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - DestinationSpace, ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy<BaseSpace, DestinationSpace, ExecutionSpace>(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy<BaseSpace, DestinationSpace, ExecutionSpace>(exec, dst, src, n); - } -}; -} // namespace Impl - -} // namespace Kokkos -#endif // KOKKOS_LOGICALSPACES_HPP diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp index 3cf7ac4fa24b666ca983e5e8aa716007fd7efba4..97b78a3c6485b6f1bb337a034fd0688a81e39bc6 100644 --- a/packages/kokkos/core/src/Kokkos_Macros.hpp +++ b/packages/kokkos/core/src/Kokkos_Macros.hpp @@ -27,7 +27,7 @@ * KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget * execution space * KOKKOS_ENABLE_HIP Kokkos::HIP execution space - * KOKKOS_ENABLE_SYCL Kokkos::Experimental::SYCL execution space + * KOKKOS_ENABLE_SYCL Kokkos::SYCL execution space * KOKKOS_ENABLE_HWLOC HWLOC library is available. * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive! 
* KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space. @@ -55,9 +55,22 @@ #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H #include <KokkosCore_config.h> +#include <impl/Kokkos_DesulAtomicsConfig.hpp> #include <impl/Kokkos_NvidiaGpuArchitectures.hpp> #endif +#if !defined(KOKKOS_ENABLE_CXX17) +#if __has_include(<version>) +#include <version> +#else +#include <ciso646> +#endif +#if defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE < 10 +#error \ + "Compiling with support for C++20 or later requires a libstdc++ version later than 9" +#endif +#endif + //---------------------------------------------------------------------------- /** Pick up compiler specific #define macros: * @@ -84,11 +97,12 @@ //---------------------------------------------------------------------------- -#if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ - !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_INTERNAL_NOT_PARALLEL +#if defined(KOKKOS_ENABLE_ATOMICS_BYPASS) && \ + (defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_HPX) || \ + defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENACC)) +#error Atomics may only be disabled if neither a host parallel nor a device backend is enabled #endif #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA @@ -118,7 +132,7 @@ #define KOKKOS_CLASS_LAMBDA [ =, *this ] #endif -//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. +// #if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. // Intel compiler for host code. 
@@ -238,10 +252,10 @@ // CLANG compiler macros #if defined(KOKKOS_COMPILER_CLANG) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -259,10 +273,10 @@ // GNU Compiler macros #if defined(KOKKOS_COMPILER_GNU) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -284,7 +298,7 @@ #if defined(KOKKOS_COMPILER_NVHPC) #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #endif @@ -331,6 +345,10 @@ #define KOKKOS_DEFAULTED_FUNCTION #endif +#if !defined(KOKKOS_DEDUCTION_GUIDE) +#define KOKKOS_DEDUCTION_GUIDE +#endif + #if !defined(KOKKOS_IMPL_HOST_FUNCTION) #define KOKKOS_IMPL_HOST_FUNCTION #endif @@ -339,10 +357,19 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif -// Temporary solution for SYCL not supporting printf in kernels. -// Might disappear at any point once we have found another solution. -#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF) -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) 
::printf(__VA_ARGS__) +// FIXME_OPENACC FIXME_OPENMPTARGET +// Move to setup files once there is more content +// clang-format off +#if defined(KOKKOS_ENABLE_OPENACC) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenACC backend" +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenMPTarget backend" +#endif +// clang-format on + +#if !defined(KOKKOS_IMPL_RELOCATABLE_FUNCTION) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION #endif //---------------------------------------------------------------------------- @@ -357,10 +384,14 @@ #define KOKKOS_FORCEINLINE_FUNCTION \ KOKKOS_IMPL_FORCEINLINE_FUNCTION \ __attribute__((annotate("KOKKOS_FORCEINLINE_FUNCTION"))) +#define KOKKOS_RELOCATABLE_FUNCTION \ + KOKKOS_IMPL_RELOCATABLE_FUNCTION \ + __attribute__((annotate("KOKKOS_RELOCATABLE_FUNCTION"))) #else #define KOKKOS_FUNCTION KOKKOS_IMPL_FUNCTION #define KOKKOS_INLINE_FUNCTION KOKKOS_IMPL_INLINE_FUNCTION #define KOKKOS_FORCEINLINE_FUNCTION KOKKOS_IMPL_FORCEINLINE_FUNCTION +#define KOKKOS_RELOCATABLE_FUNCTION KOKKOS_IMPL_RELOCATABLE_FUNCTION #endif //---------------------------------------------------------------------------- @@ -433,22 +464,6 @@ #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #endif -//---------------------------------------------------------------------------- -// Determine for what space the code is being compiled: -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) - -#if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA -#elif defined(__SYCL_DEVICE_ONLY__) && defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL -#elif defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ - defined(KOKKOS_ENABLE_HIP) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU -#else -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST 
-#endif - -#endif //---------------------------------------------------------------------------- // Remove surrounding parentheses if present @@ -541,14 +556,17 @@ static constexpr bool kokkos_omp_on_host() { return false; } // If compiling with CUDA, we must use relocatable device code to enable the // task policy. +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #if defined(KOKKOS_ENABLE_CUDA) #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) #define KOKKOS_ENABLE_TASKDAG #endif // FIXME_SYCL Tasks not implemented -#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) +#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOS_ENABLE_TASKDAG #endif +#endif #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC @@ -583,8 +601,58 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc)) #endif +// clang-format off +#if defined(__NVCOMPILER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("diag_suppress 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("diag_default 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") +#elif defined(__EDG__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("warning push") \ + _Pragma("warning disable 1478") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning pop") +#elif defined(__GNUC__) || defined(__clang__) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("GCC diagnostic pop") +#elif defined(_MSC_VER) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + 
_Pragma("warning(push)") \ + _Pragma("warning(disable: 4996)") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("warning(pop)") +#else + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + +#if defined(__NVCOMPILER) +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() \ + _Pragma("diag_suppress code_is_unreachable") \ + _Pragma("diag_suppress initialization_not_reachable") +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() \ + _Pragma("diag_default code_is_unreachable") \ + _Pragma("diag_default initialization_not_reachable") +#else +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() +#endif +// clang-format on + #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] +#ifndef KOKKOS_ENABLE_CXX17 +#define KOKKOS_IMPL_ATTRIBUTE_UNLIKELY [[unlikely]] +#else +#define KOKKOS_IMPL_ATTRIBUTE_UNLIKELY +#endif + #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM) || \ defined(KOKKOS_COMPILER_NVHPC)) && \ diff --git a/packages/kokkos/core/src/Kokkos_MasterLock.hpp b/packages/kokkos/core/src/Kokkos_MasterLock.hpp deleted file mode 100644 index 1d09617371a62c19f63705e40984646c7584056b..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/Kokkos_MasterLock.hpp +++ /dev/null @@ -1,56 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include <Kokkos_Macros.hpp> -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MASTER_LOCK_HPP -#define KOKKOS_MASTER_LOCK_HPP - -#include <Kokkos_Macros.hpp> - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - -namespace Kokkos { -namespace Experimental { - -// my be used to coordinate work between master instances -// SHOULD NOT be used within a parallel algorithm -// -// This lock should be used with with a scoped lock guard -// i.e. std::unique_lock<Lock>, std::lock_guard -// -// cannot be copied or moved -// has the following functions available -// -// Lock() -// ~Lock() -// -// void lock() -// void unlock() -// bool try_lock() -// -template <typename ExecutionSpace> -class MasterLock; - -} // namespace Experimental -} // namespace Kokkos - -#endif - -#endif // KOKKOS_MASTER_LOCK_HPP diff --git a/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp b/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp index 51a50d347dee3579ddb1008f1d3cb5cda52650e6..1a77f373fd8541a0a32665d5ef3b97198007d891 100644 --- a/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp +++ b/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp @@ -51,24 +51,6 @@ KOKKOS_IMPL_MATH_CONSTANT(phi, 1.618033988749894848204586834365638118L); } // namespace Kokkos::numbers -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Kokkos::Experimental { -using Kokkos::numbers::e_v; -using Kokkos::numbers::egamma_v; -using Kokkos::numbers::inv_pi_v; -using Kokkos::numbers::inv_sqrt3_v; -using Kokkos::numbers::inv_sqrtpi_v; -using Kokkos::numbers::ln10_v; -using Kokkos::numbers::ln2_v; -using Kokkos::numbers::log10e_v; -using Kokkos::numbers::log2e_v; -using Kokkos::numbers::phi_v; -using Kokkos::numbers::pi_v; -using Kokkos::numbers::sqrt2_v; -using Kokkos::numbers::sqrt3_v; -} // namespace Kokkos::Experimental -#endif - 
#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS diff --git a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp index ee64c67b93bd7e61f684a755028fc94bcd836644..19967782e5eeba08b8a135592c007752f16aee51 100644 --- a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp +++ b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp @@ -92,16 +92,6 @@ using promote_3_t = typename promote_3<T, U, V>::type; #endif #endif -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE -#else -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - /* nothing */ -#endif - #define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ @@ -128,13 +118,7 @@ using promote_3_t = typename promote_3<T, U, V>::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast<double>(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } // isinf, isnan, and isinfinite do not work on Windows with CUDA with std:: // getting warnings about calling host function in device function then @@ -151,9 +135,7 @@ using promote_3_t = typename promote_3<T, U, V>::type; KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral_v<T>, bool> FUNC( \ T x) { \ return ::FUNC(static_cast<double>(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #else #define 
KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ KOKKOS_INLINE_FUNCTION bool FUNC(float x) { \ @@ -173,9 +155,7 @@ using promote_3_t = typename promote_3<T, U, V>::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast<double>(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #endif #define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ @@ -218,16 +198,10 @@ using promote_3_t = typename promote_3<T, U, V>::type; long double> \ FUNC(T1 x, T2 y) { \ using Promoted = Kokkos::Impl::promote_2_t<T1, T2>; \ - static_assert(std::is_same_v<Promoted, long double>, ""); \ + static_assert(std::is_same_v<Promoted, long double>); \ using std::FUNC; \ return FUNC(static_cast<Promoted>(x), static_cast<Promoted>(y)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } #define KOKKOS_IMPL_MATH_TERNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x, float y, float z) { \ @@ -303,19 +277,25 @@ KOKKOS_INLINE_FUNCTION long long abs(long long n) { #endif } KOKKOS_INLINE_FUNCTION float abs(float x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } KOKKOS_INLINE_FUNCTION double abs(double x) { +#ifdef KOKKOS_ENABLE_SYCL + return sycl::fabs(x); // sycl::abs is only provided for integral types +#else using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs; return abs(x); +#endif } inline long double abs(long double x) { using std::abs; return abs(x); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { using ::Kokkos::abs; }) KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs) KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod) KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder) @@ -336,12 +316,6 @@ 
KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); } KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); } #endif inline long double nanl(char const* arg) { return ::nanl(arg); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { - using ::Kokkos::nan; - using ::Kokkos::nanf; - using ::Kokkos::nanl; - }) // Exponential functions KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp) // FIXME_NVHPC nvc++ has issues with exp2 @@ -478,7 +452,6 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit) // islessgreater // isunordered -#undef KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED #undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp index ce8c9e152fa3cf6633dd231b1de3e3648e7dcbf7..f7e9e2a78c4516b6be1d26c5575321f025c1c178 100644 --- a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -196,9 +196,10 @@ class MemoryPool { stats.consumed_superblocks++; stats.consumed_blocks += block_used; - stats.consumed_bytes += block_used * block_size; + stats.consumed_bytes += static_cast<size_t>(block_used) * block_size; stats.reserved_blocks += block_count - block_used; - stats.reserved_bytes += (block_count - block_used) * block_size; + stats.reserved_bytes += + static_cast<size_t>(block_count - block_used) * block_size; } } @@ -234,9 +235,9 @@ class MemoryPool { //-------------------------------------------------------------------------- - KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; + 
KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default; KOKKOS_INLINE_FUNCTION MemoryPool() diff --git a/packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp b/packages/kokkos/core/src/Kokkos_MinMax.hpp similarity index 83% rename from packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp rename to packages/kokkos/core/src/Kokkos_MinMax.hpp index 37a28a80b68e08dd154cf046804416aa581fffdd..5c60a88bfb1e3a64f3a26297d5d6850d6b0e012b 100644 --- a/packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp +++ b/packages/kokkos/core/src/Kokkos_MinMax.hpp @@ -14,13 +14,8 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include <Kokkos_Macros.hpp> -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MIN_MAX_CLAMP_HPP -#define KOKKOS_MIN_MAX_CLAMP_HPP +#ifndef KOKKOS_MIN_MAX_HPP +#define KOKKOS_MIN_MAX_HPP #include <Kokkos_Macros.hpp> #include <Kokkos_Pair.hpp> @@ -29,22 +24,6 @@ static_assert(false, namespace Kokkos { -// clamp -template <class T> -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi) { - KOKKOS_EXPECTS(!(hi < lo)); - return (value < lo) ? lo : (hi < value) ? hi : value; -} - -template <class T, class ComparatorType> -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi, - ComparatorType comp) { - KOKKOS_EXPECTS(!comp(hi, lo)); - return comp(value, lo) ? lo : comp(hi, value) ? 
hi : value; -} - // max template <class T> constexpr KOKKOS_INLINE_FUNCTION const T& max(const T& a, const T& b) { @@ -199,15 +178,6 @@ KOKKOS_INLINE_FUNCTION constexpr Kokkos::pair<T, T> minmax( return result; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Experimental { -using ::Kokkos::clamp; -using ::Kokkos::max; -using ::Kokkos::min; -using ::Kokkos::minmax; -} // namespace Experimental -#endif - } // namespace Kokkos #endif diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp index 118bf52c05fa027cb26f53e844b8708e5c723ff2..1304d3ba9260ee677368803a0f8bb281c7dca377 100644 --- a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -114,7 +114,7 @@ template <> struct signaling_NaN_helper<long double> { static constexpr long dou #endif template <class> struct digits_helper {}; template <> struct digits_helper<bool> { static constexpr int value = 1; }; -template <> struct digits_helper<char> { static constexpr int value = CHAR_BIT - std::is_signed<char>::value; }; +template <> struct digits_helper<char> { static constexpr int value = CHAR_BIT - std::is_signed_v<char>; }; template <> struct digits_helper<signed char> { static constexpr int value = CHAR_BIT - 1; }; template <> struct digits_helper<unsigned char> { static constexpr int value = CHAR_BIT; }; template <> struct digits_helper<short> { static constexpr int value = CHAR_BIT*sizeof(short)-1; }; diff --git a/packages/kokkos/core/src/Kokkos_Pair.hpp b/packages/kokkos/core/src/Kokkos_Pair.hpp index 7127c78280e2de74ab1e90c67bb0af9bc86ab080..c44d1f231098bf3dc444d2266b58ea49f6f28328 100644 --- a/packages/kokkos/core/src/Kokkos_Pair.hpp +++ b/packages/kokkos/core/src/Kokkos_Pair.hpp @@ -28,6 +28,7 @@ #endif #include <Kokkos_Macros.hpp> +#include <Kokkos_Swap.hpp> #include <utility> namespace Kokkos { @@ -412,12 +413,13 @@ KOKKOS_FORCEINLINE_FUNCTION pair<T1&, T2&> tie(T1& x, T2& y) { return 
(pair<T1&, T2&>(x, y)); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 // // Specialization of Kokkos::pair for a \c void second argument. This // is not actually a "pair"; it only contains one element, the first. // template <class T1> -struct pair<T1, void> { +struct KOKKOS_DEPRECATED pair<T1, void> { using first_type = T1; using second_type = void; @@ -447,44 +449,52 @@ struct pair<T1, void> { // Specialization of relational operators for Kokkos::pair<T1,void>. // +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif template <class T1> -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( const pair<T1, void>& lhs, const pair<T1, void>& rhs) { return lhs.first == rhs.first; } template <class T1> -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( const pair<T1, void>& lhs, const pair<T1, void>& rhs) { return !(lhs == rhs); } template <class T1> -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( const pair<T1, void>& lhs, const pair<T1, void>& rhs) { return lhs.first < rhs.first; } template <class T1> -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( const pair<T1, void>& lhs, const pair<T1, void>& rhs) { return !(rhs < lhs); } template <class T1> -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( const pair<T1, void>& lhs, const pair<T1, void>& rhs) { return rhs < lhs; } template <class T1> -KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( +KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair<T1, void>& lhs, const pair<T1, void>& 
rhs) { return !(lhs < rhs); } +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif +#endif namespace Impl { - template <class T> struct is_pair_like : std::false_type {}; template <class T, class U> diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp index 484f6c0d5f4c02a1f506a88646329d5c46d8b310..24349e95aea118b84313da2cc36fd7cf9dda473f 100644 --- a/packages/kokkos/core/src/Kokkos_Parallel.hpp +++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp @@ -72,19 +72,19 @@ struct FunctorPolicyExecutionSpace { static_assert( !is_detected<execution_space_t, Policy>::value || !is_detected<execution_space_t, Functor>::value || - std::is_same<policy_execution_space, functor_execution_space>::value, + std::is_same_v<policy_execution_space, functor_execution_space>, "A policy with an execution space and a functor with an execution space " "are given but the execution space types do not match!"); static_assert(!is_detected<execution_space_t, Policy>::value || !is_detected<device_type_t, Functor>::value || - std::is_same<policy_execution_space, - functor_device_type_execution_space>::value, + std::is_same_v<policy_execution_space, + functor_device_type_execution_space>, "A policy with an execution space and a functor with a device " "type are given but the execution space types do not match!"); static_assert(!is_detected<device_type_t, Functor>::value || !is_detected<execution_space_t, Functor>::value || - std::is_same<functor_device_type_execution_space, - functor_execution_space>::value, + std::is_same_v<functor_device_type_execution_space, + functor_execution_space>, "A functor with both an execution space and device type is " "given but their execution space types do not match!"); @@ -134,12 +134,14 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy, const FunctorType& functor) { 
uint64_t kpID = 0; - ExecPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID); + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID); + const auto& inner_policy = response.policy; - Kokkos::Impl::shared_allocation_tracking_disable(); - Impl::ParallelFor<FunctorType, ExecPolicy> closure(functor, inner_policy); - Kokkos::Impl::shared_allocation_tracking_enable(); + auto closure = + Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< + Impl::ParallelFor<FunctorType, ExecPolicy>>(functor, inner_policy); closure.execute(); @@ -348,14 +350,16 @@ template <class ExecutionPolicy, class FunctorType, std::enable_if_t<is_execution_policy<ExecutionPolicy>::value>> inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, const FunctorType& functor) { - uint64_t kpID = 0; - ExecutionPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); + uint64_t kpID = 0; + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID); + const auto& inner_policy = response.policy; - Kokkos::Impl::shared_allocation_tracking_disable(); - Impl::ParallelScan<FunctorType, ExecutionPolicy> closure(functor, - inner_policy); - Kokkos::Impl::shared_allocation_tracking_enable(); + auto closure = + Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< + Impl::ParallelScan<FunctorType, ExecutionPolicy>>(functor, + inner_policy); closure.execute(); @@ -398,18 +402,19 @@ inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); if constexpr (Kokkos::is_view<ReturnType>::value) { - Kokkos::Impl::shared_allocation_tracking_disable(); - Impl::ParallelScanWithTotal<FunctorType, 
ExecutionPolicy, - typename ReturnType::value_type> - closure(functor, inner_policy, return_value); - Kokkos::Impl::shared_allocation_tracking_enable(); + auto closure = + Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< + Impl::ParallelScanWithTotal<FunctorType, ExecutionPolicy, + typename ReturnType::value_type>>( + functor, inner_policy, return_value); closure.execute(); } else { - Kokkos::Impl::shared_allocation_tracking_disable(); Kokkos::View<ReturnType, Kokkos::HostSpace> view(&return_value); - Impl::ParallelScanWithTotal<FunctorType, ExecutionPolicy, ReturnType> - closure(functor, inner_policy, view); - Kokkos::Impl::shared_allocation_tracking_enable(); + auto closure = + Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< + Impl::ParallelScanWithTotal<FunctorType, ExecutionPolicy, + ReturnType>>(functor, inner_policy, + view); closure.execute(); } diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index d499eba6dcca49a01a420872aa41d97c20d939f5..3b89d184f2a417ed38f7380473a27f67fc34c0ba 100644 --- a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -72,8 +72,8 @@ struct Sum { }; template <typename Scalar, typename... Properties> -Sum(View<Scalar, Properties...> const&) - ->Sum<Scalar, typename View<Scalar, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE Sum(View<Scalar, Properties...> const&) + -> Sum<Scalar, typename View<Scalar, Properties...>::memory_space>; template <class Scalar, class Space> struct Prod { @@ -117,8 +117,8 @@ struct Prod { }; template <typename Scalar, typename... 
Properties> -Prod(View<Scalar, Properties...> const&) - ->Prod<Scalar, typename View<Scalar, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE Prod(View<Scalar, Properties...> const&) + -> Prod<Scalar, typename View<Scalar, Properties...>::memory_space>; template <class Scalar, class Space> struct Min { @@ -164,8 +164,8 @@ struct Min { }; template <typename Scalar, typename... Properties> -Min(View<Scalar, Properties...> const&) - ->Min<Scalar, typename View<Scalar, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE Min(View<Scalar, Properties...> const&) + -> Min<Scalar, typename View<Scalar, Properties...>::memory_space>; template <class Scalar, class Space> struct Max { @@ -212,8 +212,8 @@ struct Max { }; template <typename Scalar, typename... Properties> -Max(View<Scalar, Properties...> const&) - ->Max<Scalar, typename View<Scalar, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE Max(View<Scalar, Properties...> const&) + -> Max<Scalar, typename View<Scalar, Properties...>::memory_space>; template <class Scalar, class Space> struct LAnd { @@ -258,8 +258,8 @@ struct LAnd { }; template <typename Scalar, typename... Properties> -LAnd(View<Scalar, Properties...> const&) - ->LAnd<Scalar, typename View<Scalar, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE LAnd(View<Scalar, Properties...> const&) + -> LAnd<Scalar, typename View<Scalar, Properties...>::memory_space>; template <class Scalar, class Space> struct LOr { @@ -305,8 +305,8 @@ struct LOr { }; template <typename Scalar, typename... Properties> -LOr(View<Scalar, Properties...> const&) - ->LOr<Scalar, typename View<Scalar, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE LOr(View<Scalar, Properties...> const&) + -> LOr<Scalar, typename View<Scalar, Properties...>::memory_space>; template <class Scalar, class Space> struct BAnd { @@ -352,8 +352,8 @@ struct BAnd { }; template <typename Scalar, typename... 
Properties> -BAnd(View<Scalar, Properties...> const&) - ->BAnd<Scalar, typename View<Scalar, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE BAnd(View<Scalar, Properties...> const&) + -> BAnd<Scalar, typename View<Scalar, Properties...>::memory_space>; template <class Scalar, class Space> struct BOr { @@ -399,8 +399,8 @@ struct BOr { }; template <typename Scalar, typename... Properties> -BOr(View<Scalar, Properties...> const&) - ->BOr<Scalar, typename View<Scalar, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE BOr(View<Scalar, Properties...> const&) + -> BOr<Scalar, typename View<Scalar, Properties...>::memory_space>; template <class Scalar, class Index> struct ValLocScalar { @@ -438,7 +438,12 @@ struct MinLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity<index_type>::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -458,10 +463,10 @@ struct MinLoc { }; template <typename Scalar, typename Index, typename... 
Properties> -MinLoc(View<ValLocScalar<Scalar, Index>, Properties...> const&) - ->MinLoc<Scalar, Index, - typename View<ValLocScalar<Scalar, Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MinLoc(View<ValLocScalar<Scalar, Index>, Properties...> const&) -> MinLoc< + Scalar, Index, + typename View<ValLocScalar<Scalar, Index>, Properties...>::memory_space>; template <class Scalar, class Index, class Space> struct MaxLoc { @@ -493,7 +498,12 @@ struct MaxLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity<index_type>::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -513,10 +523,10 @@ struct MaxLoc { }; template <typename Scalar, typename Index, typename... Properties> -MaxLoc(View<ValLocScalar<Scalar, Index>, Properties...> const&) - ->MaxLoc<Scalar, Index, - typename View<ValLocScalar<Scalar, Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MaxLoc(View<ValLocScalar<Scalar, Index>, Properties...> const&) -> MaxLoc< + Scalar, Index, + typename View<ValLocScalar<Scalar, Index>, Properties...>::memory_space>; template <class Scalar> struct MinMaxScalar { @@ -577,9 +587,9 @@ struct MinMax { }; template <typename Scalar, typename... 
Properties> -MinMax(View<MinMaxScalar<Scalar>, Properties...> const&) - ->MinMax<Scalar, - typename View<MinMaxScalar<Scalar>, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE MinMax(View<MinMaxScalar<Scalar>, Properties...> const&) + -> MinMax<Scalar, + typename View<MinMaxScalar<Scalar>, Properties...>::memory_space>; template <class Scalar, class Index> struct MinMaxLocScalar { @@ -620,10 +630,16 @@ struct MinMaxLoc { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity<index_type>::min()) { + dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity<index_type>::min()) { + dest.max_loc = src.max_loc; } } @@ -646,10 +662,11 @@ struct MinMaxLoc { }; template <typename Scalar, typename Index, typename... Properties> -MinMaxLoc(View<MinMaxLocScalar<Scalar, Index>, Properties...> const&) - ->MinMaxLoc<Scalar, Index, - typename View<MinMaxLocScalar<Scalar, Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE MinMaxLoc( + View<MinMaxLocScalar<Scalar, Index>, Properties...> const&) + -> MinMaxLoc<Scalar, Index, + typename View<MinMaxLocScalar<Scalar, Index>, + Properties...>::memory_space>; // -------------------------------------------------- // reducers added to support std algorithms @@ -713,10 +730,11 @@ struct MaxFirstLoc { }; template <typename Scalar, typename Index, typename... 
Properties> -MaxFirstLoc(View<ValLocScalar<Scalar, Index>, Properties...> const&) - ->MaxFirstLoc<Scalar, Index, - typename View<ValLocScalar<Scalar, Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE MaxFirstLoc( + View<ValLocScalar<Scalar, Index>, Properties...> const&) + -> MaxFirstLoc<Scalar, Index, + typename View<ValLocScalar<Scalar, Index>, + Properties...>::memory_space>; // // MaxFirstLocCustomComparator @@ -782,11 +800,11 @@ struct MaxFirstLocCustomComparator { template <typename Scalar, typename Index, typename ComparatorType, typename... Properties> -MaxFirstLocCustomComparator( +KOKKOS_DEDUCTION_GUIDE MaxFirstLocCustomComparator( View<ValLocScalar<Scalar, Index>, Properties...> const&, ComparatorType) - ->MaxFirstLocCustomComparator<Scalar, Index, ComparatorType, - typename View<ValLocScalar<Scalar, Index>, - Properties...>::memory_space>; + -> MaxFirstLocCustomComparator<Scalar, Index, ComparatorType, + typename View<ValLocScalar<Scalar, Index>, + Properties...>::memory_space>; // // MinFirstLoc @@ -846,10 +864,11 @@ struct MinFirstLoc { }; template <typename Scalar, typename Index, typename... Properties> -MinFirstLoc(View<ValLocScalar<Scalar, Index>, Properties...> const&) - ->MinFirstLoc<Scalar, Index, - typename View<ValLocScalar<Scalar, Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE MinFirstLoc( + View<ValLocScalar<Scalar, Index>, Properties...> const&) + -> MinFirstLoc<Scalar, Index, + typename View<ValLocScalar<Scalar, Index>, + Properties...>::memory_space>; // // MinFirstLocCustomComparator @@ -915,11 +934,11 @@ struct MinFirstLocCustomComparator { template <typename Scalar, typename Index, typename ComparatorType, typename... 
Properties> -MinFirstLocCustomComparator( +KOKKOS_DEDUCTION_GUIDE MinFirstLocCustomComparator( View<ValLocScalar<Scalar, Index>, Properties...> const&, ComparatorType) - ->MinFirstLocCustomComparator<Scalar, Index, ComparatorType, - typename View<ValLocScalar<Scalar, Index>, - Properties...>::memory_space>; + -> MinFirstLocCustomComparator<Scalar, Index, ComparatorType, + typename View<ValLocScalar<Scalar, Index>, + Properties...>::memory_space>; // // MinMaxFirstLastLoc @@ -990,10 +1009,11 @@ struct MinMaxFirstLastLoc { }; template <typename Scalar, typename Index, typename... Properties> -MinMaxFirstLastLoc(View<MinMaxLocScalar<Scalar, Index>, Properties...> const&) - ->MinMaxFirstLastLoc<Scalar, Index, - typename View<MinMaxLocScalar<Scalar, Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLoc( + View<MinMaxLocScalar<Scalar, Index>, Properties...> const&) + -> MinMaxFirstLastLoc<Scalar, Index, + typename View<MinMaxLocScalar<Scalar, Index>, + Properties...>::memory_space>; // // MinMaxFirstLastLocCustomComparator @@ -1069,9 +1089,9 @@ struct MinMaxFirstLastLocCustomComparator { template <typename Scalar, typename Index, typename ComparatorType, typename... Properties> -MinMaxFirstLastLocCustomComparator( +KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLocCustomComparator( View<MinMaxLocScalar<Scalar, Index>, Properties...> const&, ComparatorType) - ->MinMaxFirstLastLocCustomComparator< + -> MinMaxFirstLastLocCustomComparator< Scalar, Index, ComparatorType, typename View<MinMaxLocScalar<Scalar, Index>, Properties...>::memory_space>; @@ -1133,9 +1153,9 @@ struct FirstLoc { }; template <typename Index, typename... 
Properties> -FirstLoc(View<FirstLocScalar<Index>, Properties...> const&) - ->FirstLoc<Index, typename View<FirstLocScalar<Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +FirstLoc(View<FirstLocScalar<Index>, Properties...> const&) -> FirstLoc< + Index, typename View<FirstLocScalar<Index>, Properties...>::memory_space>; // // LastLoc @@ -1194,9 +1214,9 @@ struct LastLoc { }; template <typename Index, typename... Properties> -LastLoc(View<LastLocScalar<Index>, Properties...> const&) - ->LastLoc<Index, - typename View<LastLocScalar<Index>, Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE LastLoc(View<LastLocScalar<Index>, Properties...> const&) + -> LastLoc<Index, typename View<LastLocScalar<Index>, + Properties...>::memory_space>; template <class Index> struct StdIsPartScalar { @@ -1261,9 +1281,10 @@ struct StdIsPartitioned { }; template <typename Index, typename... Properties> -StdIsPartitioned(View<StdIsPartScalar<Index>, Properties...> const&) - ->StdIsPartitioned<Index, typename View<StdIsPartScalar<Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE StdIsPartitioned( + View<StdIsPartScalar<Index>, Properties...> const&) + -> StdIsPartitioned<Index, typename View<StdIsPartScalar<Index>, + Properties...>::memory_space>; template <class Index> struct StdPartPointScalar { @@ -1323,9 +1344,10 @@ struct StdPartitionPoint { }; template <typename Index, typename... 
Properties> -StdPartitionPoint(View<StdPartPointScalar<Index>, Properties...> const&) - ->StdPartitionPoint<Index, typename View<StdPartPointScalar<Index>, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE StdPartitionPoint( + View<StdPartPointScalar<Index>, Properties...> const&) + -> StdPartitionPoint<Index, typename View<StdPartPointScalar<Index>, + Properties...>::memory_space>; } // namespace Kokkos namespace Kokkos { @@ -1395,9 +1417,9 @@ struct ParallelReduceReturnValue< template <class ReturnType, class FunctorType> struct ParallelReduceReturnValue< std::enable_if_t<!Kokkos::is_view<ReturnType>::value && - (!std::is_array<ReturnType>::value && - !std::is_pointer<ReturnType>::value) && - !Kokkos::is_reducer<ReturnType>::value>, + (!std::is_array_v<ReturnType> && + !std::is_pointer_v< + ReturnType>)&&!Kokkos::is_reducer<ReturnType>::value>, ReturnType, FunctorType> { using return_type = Kokkos::View<ReturnType, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; @@ -1413,8 +1435,8 @@ struct ParallelReduceReturnValue< template <class ReturnType, class FunctorType> struct ParallelReduceReturnValue< - std::enable_if_t<(std::is_array<ReturnType>::value || - std::is_pointer<ReturnType>::value)>, + std::enable_if_t<(std::is_array_v<ReturnType> || + std::is_pointer_v<ReturnType>)>, ReturnType, FunctorType> { using return_type = Kokkos::View<std::remove_const_t<ReturnType>, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; @@ -1425,7 +1447,7 @@ struct ParallelReduceReturnValue< static return_type return_value(ReturnType& return_val, const FunctorType& functor) { - if (std::is_array<ReturnType>::value) + if (std::is_array_v<ReturnType>) return return_type(return_val); else return return_type(return_val, functor.value_count); @@ -1458,8 +1480,7 @@ struct ParallelReducePolicyType< template <class PolicyType, class FunctorType> struct ParallelReducePolicyType< - std::enable_if_t<std::is_integral<PolicyType>::value>, PolicyType, - FunctorType> { + 
std::enable_if_t<std::is_integral_v<PolicyType>>, PolicyType, FunctorType> { using execution_space = typename Impl::FunctorPolicyExecutionSpace<FunctorType, void>::execution_space; @@ -1492,28 +1513,29 @@ struct ParallelReduceAdaptor { using PassedReducerType = typename return_value_adapter::reducer_type; uint64_t kpID = 0; - PolicyType inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_reduce<PassedReducerType>( - inner_policy, functor, label, kpID); - using ReducerSelector = - Kokkos::Impl::if_c<std::is_same<InvalidType, PassedReducerType>::value, + Kokkos::Impl::if_c<std::is_same_v<InvalidType, PassedReducerType>, FunctorType, PassedReducerType>; using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE, PolicyType, typename ReducerSelector::type, typename return_value_adapter::value_type>; - Kokkos::Impl::shared_allocation_tracking_disable(); - CombinedFunctorReducer functor_reducer( + using CombinedFunctorReducerType = + CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>; + + CombinedFunctorReducerType functor_reducer( functor, typename Analysis::Reducer( ReducerSelector::select(functor, return_value))); - - // FIXME Remove "Wrapper" once all backends implement the new interface - Impl::ParallelReduce<decltype(functor_reducer), PolicyType, - typename Impl::FunctorPolicyExecutionSpace< - FunctorType, PolicyType>::execution_space> - closure(functor_reducer, inner_policy, - return_value_adapter::return_value(return_value, functor)); - Kokkos::Impl::shared_allocation_tracking_enable(); + const auto& response = Kokkos::Tools::Impl::begin_parallel_reduce< + typename return_value_adapter::reducer_type>(policy, functor_reducer, + label, kpID); + const auto& inner_policy = response.policy; + + auto closure = construct_with_shared_allocation_tracking_disabled< + Impl::ParallelReduce<CombinedFunctorReducerType, PolicyType, + typename Impl::FunctorPolicyExecutionSpace< + FunctorType, PolicyType>::execution_space>>( + functor_reducer, 
inner_policy, + return_value_adapter::return_value(return_value, functor)); closure.execute(); Kokkos::Tools::Impl::end_parallel_reduce<PassedReducerType>( @@ -1527,7 +1549,7 @@ struct ParallelReduceAdaptor { template <typename Dummy = ReturnType> static inline std::enable_if_t<!(is_array_reduction && - std::is_pointer<Dummy>::value)> + std::is_pointer_v<Dummy>)> execute(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { execute_impl(label, policy, functor, return_value); @@ -1559,7 +1581,7 @@ struct ReducerHasTestReferenceFunction { static std::false_type test_func(...); enum { - value = std::is_same<std::true_type, decltype(test_func<T>(nullptr))>::value + value = std::is_same_v<std::true_type, decltype(test_func<T>(nullptr))> }; }; @@ -1602,7 +1624,7 @@ struct ParallelReduceFence { template <class... ArgsDeduced> static void fence(const ExecutionSpace& ex, const std::string& name, ArgsDeduced&&... args) { - if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) { + if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced&&)args...)) { ex.fence(name); } } @@ -1654,11 +1676,11 @@ template <class PolicyType, class FunctorType, class ReturnType> inline std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value && !(Kokkos::is_view<ReturnType>::value || Kokkos::is_reducer<ReturnType>::value || - std::is_pointer<ReturnType>::value)> + std::is_pointer_v<ReturnType>)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const<ReturnType>::value, + !std::is_const_v<ReturnType>, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1675,11 +1697,11 @@ template <class PolicyType, class FunctorType, class ReturnType> inline std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value && !(Kokkos::is_view<ReturnType>::value || 
Kokkos::is_reducer<ReturnType>::value || - std::is_pointer<ReturnType>::value)> + std::is_pointer_v<ReturnType>)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const<ReturnType>::value, + !std::is_const_v<ReturnType>, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1695,11 +1717,11 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template <class FunctorType, class ReturnType> inline std::enable_if_t<!(Kokkos::is_view<ReturnType>::value || Kokkos::is_reducer<ReturnType>::value || - std::is_pointer<ReturnType>::value)> + std::is_pointer_v<ReturnType>)> parallel_reduce(const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const<ReturnType>::value, + !std::is_const_v<ReturnType>, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1719,11 +1741,11 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template <class FunctorType, class ReturnType> inline std::enable_if_t<!(Kokkos::is_view<ReturnType>::value || Kokkos::is_reducer<ReturnType>::value || - std::is_pointer<ReturnType>::value)> + std::is_pointer_v<ReturnType>)> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const<ReturnType>::value, + !std::is_const_v<ReturnType>, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1745,7 +1767,7 @@ template <class PolicyType, class FunctorType, class ReturnType> inline std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value && (Kokkos::is_view<ReturnType>::value || Kokkos::is_reducer<ReturnType>::value || - std::is_pointer<ReturnType>::value)> + std::is_pointer_v<ReturnType>)> parallel_reduce(const std::string& label, const PolicyType& policy, const 
FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1762,7 +1784,7 @@ template <class PolicyType, class FunctorType, class ReturnType> inline std::enable_if_t<Kokkos::is_execution_policy<PolicyType>::value && (Kokkos::is_view<ReturnType>::value || Kokkos::is_reducer<ReturnType>::value || - std::is_pointer<ReturnType>::value)> + std::is_pointer_v<ReturnType>)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1778,7 +1800,7 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template <class FunctorType, class ReturnType> inline std::enable_if_t<Kokkos::is_view<ReturnType>::value || Kokkos::is_reducer<ReturnType>::value || - std::is_pointer<ReturnType>::value> + std::is_pointer_v<ReturnType>> parallel_reduce(const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = @@ -1797,7 +1819,7 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template <class FunctorType, class ReturnType> inline std::enable_if_t<Kokkos::is_view<ReturnType>::value || Kokkos::is_reducer<ReturnType>::value || - std::is_pointer<ReturnType>::value> + std::is_pointer_v<ReturnType>> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = diff --git a/packages/kokkos/core/src/Kokkos_Printf.hpp b/packages/kokkos/core/src/Kokkos_Printf.hpp index 39f95825c3822a1a72c69517fd0f928d66cf4c84..63a4cce2aeb62ab0a091a715163a96adf970a13d 100644 --- a/packages/kokkos/core/src/Kokkos_Printf.hpp +++ b/packages/kokkos/core/src/Kokkos_Printf.hpp @@ -30,8 +30,11 @@ namespace Kokkos { // In contrast to std::printf, return void to get a consistent behavior across // backends. The GPU backends always return 1 and NVHPC only compiles if we // don't ask for the return value. 
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) +using ::printf; +#else template <typename... Args> -KOKKOS_FUNCTION void printf(const char* format, Args... args) { +KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) { #ifdef KOKKOS_ENABLE_SYCL // Some compilers warn if "args" is empty and format is not a string literal if constexpr (sizeof...(Args) == 0) @@ -39,15 +42,13 @@ KOKKOS_FUNCTION void printf(const char* format, Args... args) { else sycl::ext::oneapi::experimental::printf(format, args...); #else - if constexpr (sizeof...(Args) == 0) ::printf("%s", format); - // FIXME_OPENMPTARGET non-string-literal argument used in printf is not - // supported for spir64 -#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)) + if constexpr (sizeof...(Args) == 0) + ::printf("%s", format); else ::printf(format, args...); #endif -#endif } +#endif } // namespace Kokkos diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp index 29a04ac3b07ebb2b9676d84128583be1ef948036..1759c2b4a1c54a672a33751faafb0bafe07a9322 100644 --- a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -22,49 +22,34 @@ #endif #include <Kokkos_Macros.hpp> -#include <impl/Kokkos_Profiling_Interface.hpp> #include <impl/Kokkos_Profiling.hpp> #include <string> -namespace Kokkos { -namespace Profiling { +namespace Kokkos::Profiling { + +class [[nodiscard]] ProfilingSection { + uint32_t sectionID; -class ProfilingSection { public: - ProfilingSection(ProfilingSection const&) = delete; + ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; - ProfilingSection(const std::string& sectionName) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::createProfileSection(sectionName, &secID); - } - } - - void 
start() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::startSection(secID); - } +#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 + [[nodiscard]] +#endif + explicit ProfilingSection(const std::string& sectionName) { + Kokkos::Profiling::createProfileSection(sectionName, &sectionID); } - void stop() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::stopSection(secID); - } - } + void start() { Kokkos::Profiling::startSection(sectionID); } - ~ProfilingSection() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::destroyProfileSection(secID); - } - } + void stop() { Kokkos::Profiling::stopSection(sectionID); } - protected: - uint32_t secID; + ~ProfilingSection() { Kokkos::Profiling::destroyProfileSection(sectionID); } }; -} // namespace Profiling -} // namespace Kokkos +} // namespace Kokkos::Profiling #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp index f45dfa324e9ff83672ea2cbaf7fcd6216f9f525c..a4168b9401fabec18282d94278df829e5e07b1ef 100644 --- a/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp +++ b/packages/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp @@ -30,7 +30,7 @@ namespace Kokkos::Profiling { class [[nodiscard]] ScopedRegion { public: - ScopedRegion(ScopedRegion const &) = delete; + ScopedRegion(ScopedRegion const &) = delete; ScopedRegion &operator=(ScopedRegion const &) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp index a925e32a339e70e74e8de7b75445815179040792..f00e25fdb62934ec3ff4e69518d8def50381fea6 100644 --- a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -110,7
+110,7 @@ class ScratchMemorySpace { // Note: for team scratch m_offset is 0, since every // thread will get back the same shared pointer void* tmp = m_iter + m_offset * size; - uintptr_t increment = size * m_multiplier; + uintptr_t increment = static_cast<uintptr_t>(size) * m_multiplier; // Cast to uintptr_t to avoid problems with pointer arithmetic using SYCL const auto end_iter = diff --git a/packages/kokkos/core/src/Kokkos_Swap.hpp b/packages/kokkos/core/src/Kokkos_Swap.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2f849a13ab618f95a3b797612b5fcc5cbd239994 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Swap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SWAP_HPP +#define KOKKOS_SWAP_HPP + +#include <Kokkos_Macros.hpp> + +#include <cstddef> +#include <type_traits> +#include <utility> + +namespace Kokkos { + +template <class T> +KOKKOS_FUNCTION constexpr std::enable_if_t<std::is_move_constructible_v<T> && + std::is_move_assignable_v<T>> +kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v<T>&& + std::is_nothrow_move_assignable_v<T>) { + T t(std::move(a)); + a = std::move(b); + b = std::move(t); +} + +namespace Impl { + +template <class T> +struct is_swappable { + template <class U> + static decltype(kokkos_swap(std::declval<T&>(), std::declval<T&>())) + test_swap(int); + struct Nope; + template <class U> + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v<decltype(test_swap<T>(0)), Nope>; +}; + +template <class T> +inline constexpr bool is_nothrow_swappable_v = + noexcept(kokkos_swap(std::declval<T&>(), std::declval<T&>())); + +} // namespace Impl + +template <class T, std::size_t N> +KOKKOS_FUNCTION constexpr std::enable_if_t<Impl::is_swappable<T>::value> +kokkos_swap(T (&a)[N], T (&b)[N]) noexcept(Impl::is_nothrow_swappable_v<T>) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } +} + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp index 869a5f8ec26a99e21be0404d3b60056bdd027775..3edecb4502a8a8a06f152df30b08220f9e9a1caa 100644 --- a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp +++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + 
#ifndef KOKKOS_TASKSCHEDULER_HPP #define KOKKOS_TASKSCHEDULER_HPP @@ -44,6 +50,11 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -54,7 +65,7 @@ class TaskExec; } // end namespace Impl template <class ExecSpace, class QueueType> -class BasicTaskScheduler : public Impl::TaskSchedulerBase { +class KOKKOS_DEPRECATED BasicTaskScheduler : public Impl::TaskSchedulerBase { public: using scheduler_type = BasicTaskScheduler; using execution_space = ExecSpace; @@ -494,8 +505,8 @@ namespace Kokkos { // Construct a TaskTeam execution policy template <class T, class Scheduler> -Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskTeam, - Kokkos::BasicFuture<T, Scheduler>> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskTeam, Kokkos::BasicFuture<T, Scheduler>> KOKKOS_INLINE_FUNCTION TaskTeam(Kokkos::BasicFuture<T, Scheduler> arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -503,7 +514,8 @@ Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskTeam, } template <class Scheduler> -Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskTeam, Scheduler> +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskTeam, + Scheduler> KOKKOS_INLINE_FUNCTION TaskTeam( Scheduler arg_scheduler, std::enable_if_t<Kokkos::is_scheduler<Scheduler>::value, TaskPriority> @@ -512,18 +524,18 @@ Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskTeam, Scheduler> } template <class Scheduler, class PredecessorFuture> -Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskTeam, Scheduler, - PredecessorFuture> +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskTeam, Scheduler, PredecessorFuture> 
KOKKOS_INLINE_FUNCTION TaskTeam(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t<Kokkos::is_scheduler<Scheduler>::value && Kokkos::is_future<PredecessorFuture>::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same<typename PredecessorFuture::scheduler_type, - Scheduler>::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v<typename PredecessorFuture::scheduler_type, Scheduler>, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -531,8 +543,8 @@ Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskTeam, Scheduler, // Construct a TaskSingle execution policy template <class T, class Scheduler> -Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskSingle, - Kokkos::BasicFuture<T, Scheduler>> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskSingle, Kokkos::BasicFuture<T, Scheduler>> KOKKOS_INLINE_FUNCTION TaskSingle(Kokkos::BasicFuture<T, Scheduler> arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -540,7 +552,8 @@ Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskSingle, } template <class Scheduler> -Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskSingle, Scheduler> +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskSingle, + Scheduler> KOKKOS_INLINE_FUNCTION TaskSingle( Scheduler arg_scheduler, std::enable_if_t<Kokkos::is_scheduler<Scheduler>::value, TaskPriority> @@ -549,18 +562,18 @@ Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskSingle, Scheduler> } template <class Scheduler, class PredecessorFuture> -Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskSingle, Scheduler, - PredecessorFuture> +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskSingle, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION 
TaskSingle(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t<Kokkos::is_scheduler<Scheduler>::value && Kokkos::is_future<PredecessorFuture>::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same<typename PredecessorFuture::scheduler_type, - Scheduler>::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v<typename PredecessorFuture::scheduler_type, Scheduler>, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -575,7 +588,8 @@ Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskSingle, Scheduler, */ template <int TaskEnum, typename Scheduler, typename DepFutureType, typename FunctorType> -typename Scheduler::template future_type_for_functor<std::decay_t<FunctorType>> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t<FunctorType>> host_spawn(Impl::TaskPolicyWithScheduler<TaskEnum, Scheduler, DepFutureType> arg_policy, FunctorType&& arg_functor) { @@ -606,7 +620,8 @@ host_spawn(Impl::TaskPolicyWithScheduler<TaskEnum, Scheduler, DepFutureType> */ template <int TaskEnum, typename Scheduler, typename DepFutureType, typename FunctorType> -typename Scheduler::template future_type_for_functor<std::decay_t<FunctorType>> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t<FunctorType>> KOKKOS_INLINE_FUNCTION task_spawn(Impl::TaskPolicyWithScheduler<TaskEnum, Scheduler, DepFutureType> arg_policy, @@ -633,7 +648,7 @@ typename Scheduler::template future_type_for_functor<std::decay_t<FunctorType>> * 2) High, Normal, or Low priority */ template <typename FunctorType, typename T> -void KOKKOS_INLINE_FUNCTION +KOKKOS_DEPRECATED void KOKKOS_INLINE_FUNCTION respawn(FunctorType* arg_self, T const& arg, TaskPriority const& arg_priority = 
TaskPriority::Regular) { static_assert(Kokkos::is_future<T>::value || Kokkos::is_scheduler<T>::value, @@ -656,7 +671,8 @@ respawn(FunctorType* arg_self, T const& arg, // Wait for all runnable tasks to complete template <class ExecSpace, class QueueType> -inline void wait(BasicTaskScheduler<ExecSpace, QueueType> const& scheduler) { +KOKKOS_DEPRECATED inline void wait( + BasicTaskScheduler<ExecSpace, QueueType> const& scheduler) { using scheduler_type = BasicTaskScheduler<ExecSpace, QueueType>; scheduler_type::specialization::execute(scheduler); // scheduler.m_queue->execute(); @@ -664,6 +680,10 @@ inline void wait(BasicTaskScheduler<ExecSpace, QueueType> const& scheduler) { } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp index 203fb16eaf0b4de3757df42c450bf684d9a547e5..83e1c06db9b948d29f424464259394a862c763b0 100644 --- a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp +++ b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp @@ -31,31 +31,40 @@ static_assert(false, #include <Kokkos_Core_fwd.hpp> //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // Forward declarations used in Impl::TaskQueue template <typename ValueType, typename Scheduler> -class BasicFuture; +class KOKKOS_DEPRECATED BasicFuture; template <class Space, class Queue> -class SimpleTaskScheduler; +class KOKKOS_DEPRECATED SimpleTaskScheduler; template <class Space, class Queue> -class BasicTaskScheduler; +class KOKKOS_DEPRECATED BasicTaskScheduler; 
template <typename Space> -struct is_scheduler : public std::false_type {}; +struct KOKKOS_DEPRECATED is_scheduler : public std::false_type {}; template <class Space, class Queue> -struct is_scheduler<BasicTaskScheduler<Space, Queue>> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler<BasicTaskScheduler<Space, Queue>> + : public std::true_type {}; template <class Space, class Queue> -struct is_scheduler<SimpleTaskScheduler<Space, Queue>> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler<SimpleTaskScheduler<Space, Queue>> + : public std::true_type {}; -enum class TaskPriority : int { High = 0, Regular = 1, Low = 2 }; +enum class KOKKOS_DEPRECATED TaskPriority : int { + High = 0, + Regular = 1, + Low = 2 +}; } // namespace Kokkos @@ -141,28 +150,28 @@ using default_tasking_memory_space_for_execution_space_t = namespace Kokkos { template <typename Space> -using DeprecatedTaskScheduler = BasicTaskScheduler< +using DeprecatedTaskScheduler KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t<Space>>>; template <typename Space> -using DeprecatedTaskSchedulerMultiple = BasicTaskScheduler< +using DeprecatedTaskSchedulerMultiple KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueueMultiple< Space, Impl::default_tasking_memory_space_for_execution_space_t<Space>>>; template <typename Space> -using TaskScheduler = SimpleTaskScheduler< +using TaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::SingleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t<Space>, Impl::TaskQueueTraitsLockBased>>; template <typename Space> -using TaskSchedulerMultiple = SimpleTaskScheduler< +using TaskSchedulerMultiple KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t<Space>, @@ -172,7 +181,7 @@ using TaskSchedulerMultiple = SimpleTaskScheduler< 
Impl::default_tasking_memory_space_for_execution_space_t<Space>>>>>; template <typename Space> -using ChaseLevTaskScheduler = SimpleTaskScheduler< +using ChaseLevTaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t<Space>, @@ -182,7 +191,7 @@ using ChaseLevTaskScheduler = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t<Space>>>>>; template <class Space, class QueueType> -void wait(BasicTaskScheduler<Space, QueueType> const&); +KOKKOS_DEPRECATED void wait(BasicTaskScheduler<Space, QueueType> const&); namespace Impl { @@ -204,6 +213,10 @@ struct TaskPolicyData; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ diff --git a/packages/kokkos/core/src/Kokkos_Timer.hpp b/packages/kokkos/core/src/Kokkos_Timer.hpp index a210b6ff18329eb709243ef688460ddaaf6f2603..ab31484d76ac2a13730778c76277de1bdc376ea8 100644 --- a/packages/kokkos/core/src/Kokkos_Timer.hpp +++ b/packages/kokkos/core/src/Kokkos_Timer.hpp @@ -48,7 +48,7 @@ class Timer { inline Timer() { reset(); } - Timer(const Timer&) = delete; + Timer(const Timer&) = delete; Timer& operator=(const Timer&) = delete; inline double seconds() const { diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp index 618401654e74042d8fdcd98a1a7a242f661a7199..fcb061b378f2ddac258b09a301297eb40d432aa3 100644 --- a/packages/kokkos/core/src/Kokkos_Tuners.hpp +++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp @@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t); VariableValue make_variable_value(size_t, double); SetOrRange make_candidate_range(double lower, double upper, double step, bool openLower, bool openUpper); +SetOrRange make_candidate_range(int64_t 
lower, int64_t upper, int64_t step, + bool openLower, bool openUpper); size_t get_new_context_id(); void begin_context(size_t context_id); void end_context(size_t context_id); @@ -256,13 +258,14 @@ auto get_point_helper(const PointType& in, const ArrayType& indices, template <typename PointType, typename ArrayType> struct GetPoint; -template <typename PointType, size_t X> -struct GetPoint<PointType, - std::array<Kokkos::Tools::Experimental::VariableValue, X>> { +template <typename PointType, size_t ArraySize> +struct GetPoint< + PointType, + std::array<Kokkos::Tools::Experimental::VariableValue, ArraySize>> { using index_set_type = - std::array<Kokkos::Tools::Experimental::VariableValue, X>; + std::array<Kokkos::Tools::Experimental::VariableValue, ArraySize>; static auto build(const PointType& in, const index_set_type& indices) { - return get_point_helper(in, indices, std::make_index_sequence<X>{}); + return get_point_helper(in, indices, std::make_index_sequence<ArraySize>{}); } }; @@ -411,18 +414,19 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> { TunerType tuner; public: - TeamSizeTuner() = default; + TeamSizeTuner() = default; TeamSizeTuner& operator=(const TeamSizeTuner& other) = default; TeamSizeTuner(const TeamSizeTuner& other) = default; - TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; - TeamSizeTuner(TeamSizeTuner&& other) = default; + TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; + TeamSizeTuner(TeamSizeTuner&& other) = default; template <typename ViableConfigurationCalculator, typename Functor, typename TagType, typename... 
Properties> TeamSizeTuner(const std::string& name, - Kokkos::TeamPolicy<Properties...>& policy, + const Kokkos::TeamPolicy<Properties...>& policy_in, const Functor& functor, const TagType& tag, ViableConfigurationCalculator calc) { - using PolicyType = Kokkos::TeamPolicy<Properties...>; + using PolicyType = Kokkos::TeamPolicy<Properties...>; + PolicyType policy(policy_in); auto initial_vector_length = policy.impl_vector_length(); if (initial_vector_length < 1) { policy.impl_set_vector_length(1); @@ -504,7 +508,8 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> { } template <typename... Properties> - void tune(Kokkos::TeamPolicy<Properties...>& policy) { + auto tune(const Kokkos::TeamPolicy<Properties...>& policy_in) { + Kokkos::TeamPolicy<Properties...> policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); auto team_size = std::get<1>(configuration); @@ -514,6 +519,111 @@ class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> { policy.impl_set_vector_length(vector_length); } } + return policy; + } + void end() { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + tuner.end(); + } + } + + TunerType get_tuner() const { return tuner; } +}; +namespace Impl { +template <class T> +struct tuning_type_for; + +template <> +struct tuning_type_for<double> { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_double; + static double get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.double_value; + } +}; +template <> +struct tuning_type_for<int64_t> { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + static int64_t get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.int_value; + } +}; +} // namespace Impl +template <class Bound> +class 
SingleDimensionalRangeTuner { + size_t id; + size_t context; + using tuning_util = Impl::tuning_type_for<Bound>; + + Bound default_value; + + public: + SingleDimensionalRangeTuner() = default; + SingleDimensionalRangeTuner( + const std::string& name, + Kokkos::Tools::Experimental::StatisticalCategory category, + Bound default_val, Bound lower, Bound upper, Bound step = (Bound)0) { + default_value = default_val; + Kokkos::Tools::Experimental::VariableInfo info; + info.category = category; + info.candidates = make_candidate_range( + static_cast<Bound>(lower), static_cast<Bound>(upper), + static_cast<Bound>(step), false, false); + info.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range; + info.type = tuning_util::value; + id = Kokkos::Tools::Experimental::declare_output_type(name, info); + } + + Bound begin() { + context = Kokkos::Tools::Experimental::get_new_context_id(); + Kokkos::Tools::Experimental::begin_context(context); + auto tuned_value = + Kokkos::Tools::Experimental::make_variable_value(id, default_value); + Kokkos::Tools::Experimental::request_output_values(context, 1, + &tuned_value); + return tuning_util::get(tuned_value); + } + + void end() { Kokkos::Tools::Experimental::end_context(context); } + + template <typename Functor> + void with_tuned_value(Functor& func) { + func(begin()); + end(); + } +}; + +class RangePolicyOccupancyTuner { + private: + using TunerType = SingleDimensionalRangeTuner<int64_t>; + TunerType tuner; + + public: + RangePolicyOccupancyTuner() = default; + template <typename ViableConfigurationCalculator, typename Functor, + typename TagType, typename... Properties> + RangePolicyOccupancyTuner(const std::string& name, + const Kokkos::RangePolicy<Properties...>&, + const Functor&, const TagType&, + ViableConfigurationCalculator) + : tuner(TunerType(name, + Kokkos::Tools::Experimental::StatisticalCategory:: + kokkos_value_ratio, + 100, 5, 100, 5)) {} + + template <typename... 
Properties> + auto tune(const Kokkos::RangePolicy<Properties...>& policy_in) { + Kokkos::RangePolicy<Properties...> policy(policy_in); + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + auto occupancy = tuner.begin(); + policy.impl_set_desired_occupancy( + Kokkos::Experimental::DesiredOccupancy{static_cast<int>(occupancy)}); + } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { @@ -577,11 +687,13 @@ struct MDRangeTuner : public ExtendableTunerMixin<MDRangeTuner<MDRangeRank>> { policy.impl_change_tile_size({std::get<Indices>(tuple)...}); } template <typename... Properties> - void tune(Kokkos::MDRangePolicy<Properties...>& policy) { + auto tune(const Kokkos::MDRangePolicy<Properties...>& policy_in) { + Kokkos::MDRangePolicy<Properties...> policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); set_policy_tile(policy, configuration, std::make_index_sequence<rank>{}); } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { diff --git a/packages/kokkos/core/src/Kokkos_TypeInfo.hpp b/packages/kokkos/core/src/Kokkos_TypeInfo.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e5710da2e3d5914444074c939ded9d3388e3f78e --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_TypeInfo.hpp @@ -0,0 +1,103 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TYPE_INFO_HPP +#define KOKKOS_TYPE_INFO_HPP + +#include <array> +#include <string_view> +#include <utility> + +#include <Kokkos_Macros.hpp> + +// Intel C++ Compiler Classic version 2021.2.0 works but 2021.1.2 doesn't +// Both have __INTEL_COMPILER defined to 2021 so using +// __INTEL_COMPILER_BUILD_DATE to discriminate. +// Experimenting on the compiler explorer gave +// icc version | __INTEL_COMPILER | __INTEL_COMPILER_BUILD_DATE +// 2021.1.2 | 2021 | 20201208 +// 2021.2.0 | 2021 | 20210228 +// NVCC versions less than 11.3.0 segfault when that header is included +// NVCC+MSVC doesn't work at all - it simply reports "T" inside type_name +#if (!defined(KOKKOS_COMPILER_INTEL) || \ + (__INTEL_COMPILER_BUILD_DATE >= 20210228)) && \ + (!defined(KOKKOS_COMPILER_NVCC) || (KOKKOS_COMPILER_NVCC >= 1130)) && \ + (!(defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_COMPILER_MSVC))) + +#define KOKKOS_ENABLE_IMPL_TYPEINFO + +namespace Kokkos::Impl { + +template <size_t N> +constexpr std::array<char, N> to_array(std::string_view src) { + std::array<char, N> dst{}; + for (size_t i = 0; i < N; ++i) { + dst[i] = src[i]; + } + return dst; +} + +template <class T> +constexpr auto type_name() { +#if defined(__clang__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(__GNUC__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[with T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(_MSC_VER) + constexpr std::string_view func = __FUNCSIG__; + constexpr std::string_view prefix{"type_name<"}; + constexpr std::string_view suffix{">(void)"}; +#else +#error bug +#endif + constexpr auto beg = func.find(prefix) + prefix.size(); + constexpr auto end = func.rfind(suffix); + static_assert(beg != std::string_view::npos); + static_assert(end 
!= std::string_view::npos); + return to_array<end - beg>(func.substr(beg, end)); +} + +template <class T> +class TypeInfo { + static constexpr auto value_ = type_name<T>(); + + public: + static constexpr std::string_view name() noexcept { + return {value_.data(), value_.size()}; + } +}; + +} // namespace Kokkos::Impl + +#else // out of luck, using Intel C++ Compiler Classic + +namespace Kokkos::Impl { + +template <class T> +class TypeInfo { + public: + static constexpr std::string_view name() noexcept { return "not supported"; } +}; + +} // namespace Kokkos::Impl + +#endif + +#endif diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp index bcbb28014cd935acfb7c7919be867f5b769be7ef..d5b352876c30fa7a21d1533bfb7e320b98b7a43c 100644 --- a/packages/kokkos/core/src/Kokkos_View.hpp +++ b/packages/kokkos/core/src/Kokkos_View.hpp @@ -22,1918 +22,10 @@ static_assert(false, #ifndef KOKKOS_VIEW_HPP #define KOKKOS_VIEW_HPP -#include <type_traits> -#include <string> -#include <algorithm> -#include <initializer_list> - -#include <Kokkos_Core_fwd.hpp> -#include <Kokkos_HostSpace.hpp> -#include <Kokkos_MemoryTraits.hpp> -#include <Kokkos_ExecPolicy.hpp> -#include <View/Hooks/Kokkos_ViewHooks.hpp> - -#include <impl/Kokkos_Tools.hpp> -#include <impl/Kokkos_Utilities.hpp> - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -#include <View/MDSpan/Kokkos_MDSpan_Extents.hpp> -#endif -#include <Kokkos_MinMaxClamp.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <class DataType> -struct ViewArrayAnalysis; - -template <class DataType, class ArrayLayout, - typename ValueType = - typename ViewArrayAnalysis<DataType>::non_const_value_type> -struct ViewDataAnalysis; - -template <class, class...> -class ViewMapping { - public: - enum : bool { is_assignable_data_type = false }; - enum : bool { 
is_assignable = false }; -}; - -template <typename IntType> -constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( - const IntType i0, const IntType i1, const IntType i2, const IntType i3, - const IntType i4, const IntType i5, const IntType i6, const IntType i7) { - static_assert(std::is_integral<IntType>::value, - "count_valid_integers() must have integer arguments."); - - return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + - (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + - (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + - (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); -} - -KOKKOS_INLINE_FUNCTION -void runtime_check_rank(const size_t rank, const size_t dyn_rank, - const bool is_void_spec, const size_t i0, - const size_t i1, const size_t i2, const size_t i3, - const size_t i4, const size_t i5, const size_t i6, - const size_t i7, const std::string& label) { - (void)(label); - - if (is_void_spec) { - const size_t num_passed_args = - count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); - - if (num_passed_args != dyn_rank && num_passed_args != rank) { - KOKKOS_IF_ON_HOST( - const std::string message = - "Constructor for Kokkos View '" + label + - "' has mismatched number of arguments. 
Number of arguments = " + - std::to_string(num_passed_args) + - " but dynamic rank = " + std::to_string(dyn_rank) + " \n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " - "mismatched number of arguments.");) - } - } -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -// Class to provide a uniform type -namespace Kokkos { -namespace Impl { -template <class ViewType, int Traits = 0> -struct ViewUniformType; -} -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \class ViewTraits - * \brief Traits class for accessing attributes of a View. - * - * This is an implementation detail of View. It is only of interest - * to developers implementing a new specialization of View. - * - * Template argument options: - * - View< DataType > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , ArrayLayout > - * - View< DataType , ArrayLayout , Space > - * - View< DataType , ArrayLayout , MemoryTraits > - * - View< DataType , ArrayLayout , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - */ - -template <class DataType, class... Properties> -struct ViewTraits; - -template <> -struct ViewTraits<void> { - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = void; - using specialize = void; - using hooks_policy = void; -}; - -template <class... 
Prop> -struct ViewTraits<void, void, Prop...> { - // Ignore an extraneous 'void' - using execution_space = typename ViewTraits<void, Prop...>::execution_space; - using memory_space = typename ViewTraits<void, Prop...>::memory_space; - using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace; - using array_layout = typename ViewTraits<void, Prop...>::array_layout; - using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; - using specialize = typename ViewTraits<void, Prop...>::specialize; - using hooks_policy = typename ViewTraits<void, Prop...>::hooks_policy; -}; - -template <class HooksPolicy, class... Prop> -struct ViewTraits< - std::enable_if_t<Kokkos::Experimental::is_hooks_policy<HooksPolicy>::value>, - HooksPolicy, Prop...> { - using execution_space = typename ViewTraits<void, Prop...>::execution_space; - using memory_space = typename ViewTraits<void, Prop...>::memory_space; - using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace; - using array_layout = typename ViewTraits<void, Prop...>::array_layout; - using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; - using specialize = typename ViewTraits<void, Prop...>::specialize; - using hooks_policy = HooksPolicy; -}; - -template <class ArrayLayout, class... 
Prop> -struct ViewTraits<std::enable_if_t<Kokkos::is_array_layout<ArrayLayout>::value>, - ArrayLayout, Prop...> { - // Specify layout, keep subsequent space and memory traits arguments - - using execution_space = typename ViewTraits<void, Prop...>::execution_space; - using memory_space = typename ViewTraits<void, Prop...>::memory_space; - using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace; - using array_layout = ArrayLayout; - using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; - using specialize = typename ViewTraits<void, Prop...>::specialize; - using hooks_policy = typename ViewTraits<void, Prop...>::hooks_policy; -}; - -template <class Space, class... Prop> -struct ViewTraits<std::enable_if_t<Kokkos::is_space<Space>::value>, Space, - Prop...> { - // Specify Space, memory traits should be the only subsequent argument. - - static_assert( - std::is_same<typename ViewTraits<void, Prop...>::execution_space, - void>::value && - std::is_same<typename ViewTraits<void, Prop...>::memory_space, - void>::value && - std::is_same<typename ViewTraits<void, Prop...>::HostMirrorSpace, - void>::value && - std::is_same<typename ViewTraits<void, Prop...>::array_layout, - void>::value, - "Only one View Execution or Memory Space template argument"); - - using execution_space = typename Space::execution_space; - using memory_space = typename Space::memory_space; - using HostMirrorSpace = - typename Kokkos::Impl::HostMirror<Space>::Space::memory_space; - using array_layout = typename execution_space::array_layout; - using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; - using specialize = typename ViewTraits<void, Prop...>::specialize; - using hooks_policy = typename ViewTraits<void, Prop...>::hooks_policy; -}; - -template <class MemoryTraits, class... 
Prop> -struct ViewTraits< - std::enable_if_t<Kokkos::is_memory_traits<MemoryTraits>::value>, - MemoryTraits, Prop...> { - // Specify memory trait, should not be any subsequent arguments - - static_assert( - std::is_same<typename ViewTraits<void, Prop...>::execution_space, - void>::value && - std::is_same<typename ViewTraits<void, Prop...>::memory_space, - void>::value && - std::is_same<typename ViewTraits<void, Prop...>::array_layout, - void>::value && - std::is_same<typename ViewTraits<void, Prop...>::memory_traits, - void>::value && - std::is_same<typename ViewTraits<void, Prop...>::hooks_policy, - void>::value, - "MemoryTrait is the final optional template argument for a View"); - - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = MemoryTraits; - using specialize = void; - using hooks_policy = void; -}; - -template <class DataType, class... Properties> -struct ViewTraits { - private: - // Unpack the properties arguments - using prop = ViewTraits<void, Properties...>; - - using ExecutionSpace = - std::conditional_t<!std::is_void<typename prop::execution_space>::value, - typename prop::execution_space, - Kokkos::DefaultExecutionSpace>; - - using MemorySpace = - std::conditional_t<!std::is_void<typename prop::memory_space>::value, - typename prop::memory_space, - typename ExecutionSpace::memory_space>; - - using ArrayLayout = - std::conditional_t<!std::is_void<typename prop::array_layout>::value, - typename prop::array_layout, - typename ExecutionSpace::array_layout>; - - using HostMirrorSpace = std::conditional_t< - !std::is_void<typename prop::HostMirrorSpace>::value, - typename prop::HostMirrorSpace, - typename Kokkos::Impl::HostMirror<ExecutionSpace>::Space>; - - using MemoryTraits = - std::conditional_t<!std::is_void<typename prop::memory_traits>::value, - typename prop::memory_traits, - typename Kokkos::MemoryManaged>; - - using HooksPolicy = - 
std::conditional_t<!std::is_void<typename prop::hooks_policy>::value, - typename prop::hooks_policy, - Kokkos::Experimental::DefaultViewHooks>; - - // Analyze data type's properties, - // May be specialized based upon the layout and value type - using data_analysis = Kokkos::Impl::ViewDataAnalysis<DataType, ArrayLayout>; - - public: - //------------------------------------ - // Data type traits: - - using data_type = typename data_analysis::type; - using const_data_type = typename data_analysis::const_type; - using non_const_data_type = typename data_analysis::non_const_type; - - //------------------------------------ - // Compatible array of trivial type traits: - - using scalar_array_type = typename data_analysis::scalar_array_type; - using const_scalar_array_type = - typename data_analysis::const_scalar_array_type; - using non_const_scalar_array_type = - typename data_analysis::non_const_scalar_array_type; - - //------------------------------------ - // Value type traits: - - using value_type = typename data_analysis::value_type; - using const_value_type = typename data_analysis::const_value_type; - using non_const_value_type = typename data_analysis::non_const_value_type; - - //------------------------------------ - // Mapping traits: - - using array_layout = ArrayLayout; - using dimension = typename data_analysis::dimension; - - using specialize = std::conditional_t< - std::is_void<typename data_analysis::specialize>::value, - typename prop::specialize, - typename data_analysis::specialize>; /* mapping specialization tag */ - - static constexpr unsigned rank = dimension::rank; - static constexpr unsigned rank_dynamic = dimension::rank_dynamic; - - //------------------------------------ - // Execution space, memory space, memory access traits, and host mirror space. 
- - using execution_space = ExecutionSpace; - using memory_space = MemorySpace; - using device_type = Kokkos::Device<ExecutionSpace, MemorySpace>; - using memory_traits = MemoryTraits; - using host_mirror_space = HostMirrorSpace; - using hooks_policy = HooksPolicy; - - using size_type = typename MemorySpace::size_type; - - enum { is_hostspace = std::is_same<MemorySpace, HostSpace>::value }; - enum { is_managed = MemoryTraits::is_unmanaged == 0 }; - enum { is_random_access = MemoryTraits::is_random_access == 1 }; - - //------------------------------------ -}; - -/** \class View - * \brief View to an array of data. - * - * A View represents an array of one or more dimensions. - * For details, please refer to Kokkos' tutorial materials. - * - * \section Kokkos_View_TemplateParameters Template parameters - * - * This class has both required and optional template parameters. The - * \c DataType parameter must always be provided, and must always be - * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are - * placeholders for different template parameters. The default value - * of the fifth template parameter \c Specialize suffices for most use - * cases. When explaining the template parameters, we won't refer to - * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer - * to the valid categories of template parameters, in whatever order - * they may occur. - * - * Valid ways in which template arguments may be specified: - * - View< DataType > - * - View< DataType , Layout > - * - View< DataType , Layout , Space > - * - View< DataType , Layout , Space , MemoryTraits > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - * - * \tparam DataType (required) This indicates both the type of each - * entry of the array, and the combination of compile-time and - * run-time array dimension(s). 
For example, <tt>double*</tt> - * indicates a one-dimensional array of \c double with run-time - * dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int - * with run-time first dimension and compile-time second dimension - * (of 3). In general, the run-time dimensions (if any) must go - * first, followed by zero or more compile-time dimensions. For - * more examples, please refer to the tutorial materials. - * - * \tparam Space (required) The memory space. - * - * \tparam Layout (optional) The array's layout in memory. For - * example, LayoutLeft indicates a column-major (Fortran style) - * layout, and LayoutRight a row-major (C style) layout. If not - * specified, this defaults to the preferred layout for the - * <tt>Space</tt>. - * - * \tparam MemoryTraits (optional) Assertion of the user's intended - * access behavior. For example, RandomAccess indicates read-only - * access with limited spatial locality, and Unmanaged lets users - * wrap externally allocated memory in a View without automatic - * deallocation. - * - * \section Kokkos_View_MT MemoryTraits discussion - * - * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on - * Space - * - * Some \c MemoryTraits options may have different interpretations for - * different \c Space types. For example, with the Cuda device, - * \c RandomAccess tells Kokkos to fetch the data through the texture - * cache, whereas the non-GPU devices have no such hardware construct. - * - * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits - * - * Users should defer applying the optional \c MemoryTraits parameter - * until the point at which they actually plan to rely on it in a - * computational kernel. This minimizes the number of template - * parameters exposed in their code, which reduces the cost of - * compilation. Users may always assign a View without specified - * \c MemoryTraits to a compatible View with that specification. 
- * For example: - * \code - * // Pass in the simplest types of View possible. - * void - * doSomething (View<double*, Cuda> out, - * View<const double*, Cuda> in) - * { - * // Assign the "generic" View in to a RandomAccess View in_rr. - * // Note that RandomAccess View objects must have const data. - * View<const double*, Cuda, RandomAccess> in_rr = in; - * // ... do something with in_rr and out ... - * } - * \endcode - */ - -} // namespace Kokkos - -namespace Kokkos { - -template <class T1, class T2> -struct is_always_assignable_impl; - -template <class... ViewTDst, class... ViewTSrc> -struct is_always_assignable_impl<Kokkos::View<ViewTDst...>, - Kokkos::View<ViewTSrc...>> { - using mapping_type = Kokkos::Impl::ViewMapping< - typename Kokkos::View<ViewTDst...>::traits, - typename Kokkos::View<ViewTSrc...>::traits, - typename Kokkos::View<ViewTDst...>::traits::specialize>; - - constexpr static bool value = - mapping_type::is_assignable && - static_cast<int>(Kokkos::View<ViewTDst...>::rank_dynamic) >= - static_cast<int>(Kokkos::View<ViewTSrc...>::rank_dynamic); -}; - -template <class View1, class View2> -using is_always_assignable = is_always_assignable_impl< - std::remove_reference_t<View1>, - std::remove_const_t<std::remove_reference_t<View2>>>; - -template <class T1, class T2> -inline constexpr bool is_always_assignable_v = - is_always_assignable<T1, T2>::value; - -template <class... ViewTDst, class... 
ViewTSrc> -constexpr bool is_assignable(const Kokkos::View<ViewTDst...>& dst, - const Kokkos::View<ViewTSrc...>& src) { - using DstTraits = typename Kokkos::View<ViewTDst...>::traits; - using SrcTraits = typename Kokkos::View<ViewTSrc...>::traits; - using mapping_type = - Kokkos::Impl::ViewMapping<DstTraits, SrcTraits, - typename DstTraits::specialize>; - - return is_always_assignable_v<Kokkos::View<ViewTDst...>, - Kokkos::View<ViewTSrc...>> || - (mapping_type::is_assignable && - ((DstTraits::dimension::rank_dynamic >= 1) || - (dst.static_extent(0) == src.extent(0))) && - ((DstTraits::dimension::rank_dynamic >= 2) || - (dst.static_extent(1) == src.extent(1))) && - ((DstTraits::dimension::rank_dynamic >= 3) || - (dst.static_extent(2) == src.extent(2))) && - ((DstTraits::dimension::rank_dynamic >= 4) || - (dst.static_extent(3) == src.extent(3))) && - ((DstTraits::dimension::rank_dynamic >= 5) || - (dst.static_extent(4) == src.extent(4))) && - ((DstTraits::dimension::rank_dynamic >= 6) || - (dst.static_extent(5) == src.extent(5))) && - ((DstTraits::dimension::rank_dynamic >= 7) || - (dst.static_extent(6) == src.extent(6))) && - ((DstTraits::dimension::rank_dynamic >= 8) || - (dst.static_extent(7) == src.extent(7)))); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include <impl/Kokkos_ViewMapping.hpp> -#include <impl/Kokkos_ViewArray.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with -// the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp declare target -#endif - -inline constexpr Kokkos::ALL_t ALL{}; - -#if 
defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp end declare target -#endif - -inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; - -inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; - -/** \brief Create View allocation parameter bundle from argument list. - * - * Valid argument list members are: - * 1) label as a "string" or std::string - * 2) memory space instance of the View::memory_space type - * 3) execution space instance compatible with the View::memory_space - * 4) Kokkos::WithoutInitializing to bypass initialization - * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory - * alignment - */ -template <class... Args> -inline Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...> -view_alloc(Args const&... args) { - using return_type = - Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...>; - - static_assert(!return_type::has_pointer, - "Cannot give pointer-to-memory for view allocation"); - - return return_type(args...); -} - -template <class... Args> -KOKKOS_INLINE_FUNCTION - Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...> - view_wrap(Args const&... args) { - using return_type = - Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...>; - - static_assert(!return_type::has_memory_space && - !return_type::has_execution_space && - !return_type::has_label && return_type::has_pointer, - "Must only give pointer-to-memory for view wrapping"); - - return return_type(args...); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template <class DataType, class... Properties> -class View; - -template <class> -struct is_view : public std::false_type {}; - -template <class D, class... 
P> -struct is_view<View<D, P...>> : public std::true_type {}; - -template <class D, class... P> -struct is_view<const View<D, P...>> : public std::true_type {}; - -template <class T> -inline constexpr bool is_view_v = is_view<T>::value; - -template <class DataType, class... Properties> -class View : public ViewTraits<DataType, Properties...> { - private: - template <class, class...> - friend class View; - template <class, class...> - friend class Kokkos::Impl::ViewMapping; - - using view_tracker_type = Kokkos::Impl::ViewTracker<View>; - - public: - using traits = ViewTraits<DataType, Properties...>; - - private: - using map_type = - Kokkos::Impl::ViewMapping<traits, typename traits::specialize>; - template <typename V> - friend struct Kokkos::Impl::ViewTracker; - using hooks_policy = typename traits::hooks_policy; - - view_tracker_type m_track; - map_type m_map; - - public: - //---------------------------------------- - /** \brief Compatible view of array of scalar types */ - using array_type = - View<typename traits::scalar_array_type, typename traits::array_layout, - typename traits::device_type, typename traits::hooks_policy, - typename traits::memory_traits>; - - /** \brief Compatible view of const data type */ - using const_type = - View<typename traits::const_data_type, typename traits::array_layout, - typename traits::device_type, typename traits::hooks_policy, - typename traits::memory_traits>; - - /** \brief Compatible view of non-const data type */ - using non_const_type = - View<typename traits::non_const_data_type, typename traits::array_layout, - typename traits::device_type, typename traits::hooks_policy, - typename traits::memory_traits>; - - /** \brief Compatible HostMirror view */ - using HostMirror = - View<typename traits::non_const_data_type, typename traits::array_layout, - Device<DefaultHostExecutionSpace, - typename traits::host_mirror_space::memory_space>, - typename traits::hooks_policy>; - - /** \brief Compatible HostMirror view */ - using 
host_mirror_type = - View<typename traits::non_const_data_type, typename traits::array_layout, - typename traits::host_mirror_space, typename traits::hooks_policy>; - - /** \brief Unified types */ - using uniform_type = typename Impl::ViewUniformType<View, 0>::type; - using uniform_const_type = - typename Impl::ViewUniformType<View, 0>::const_type; - using uniform_runtime_type = - typename Impl::ViewUniformType<View, 0>::runtime_type; - using uniform_runtime_const_type = - typename Impl::ViewUniformType<View, 0>::runtime_const_type; - using uniform_nomemspace_type = - typename Impl::ViewUniformType<View, 0>::nomemspace_type; - using uniform_const_nomemspace_type = - typename Impl::ViewUniformType<View, 0>::const_nomemspace_type; - using uniform_runtime_nomemspace_type = - typename Impl::ViewUniformType<View, 0>::runtime_nomemspace_type; - using uniform_runtime_const_nomemspace_type = - typename Impl::ViewUniformType<View, 0>::runtime_const_nomemspace_type; - - //---------------------------------------- - // Domain rank and extents - - static constexpr Impl::integral_constant<size_t, traits::dimension::rank> - rank = {}; - static constexpr Impl::integral_constant<size_t, - traits::dimension::rank_dynamic> - rank_dynamic = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = - map_type::Rank}; -#endif - - template <typename iType> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral<iType>::value, size_t> - extent(const iType& r) const noexcept { - return m_map.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return map_type::static_extent(r); - } - - template <typename iType> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral<iType>::value, int> - extent_int(const iType& r) const noexcept { - return static_cast<int>(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout 
layout() - const { - return m_map.layout(); - } - - //---------------------------------------- - /* Deprecate all 'dimension' functions in favor of - * ISO/C++ vocabulary 'extent'. - */ - - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template <typename iType> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral<iType>::value, size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. 
- - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference<reference_type>::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>& - impl_map() const { - return m_map; - } - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::SharedAllocationTracker& impl_track() const { - return m_track.m_tracker; - } - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same<typename traits::array_layout, Kokkos::LayoutLeft>::value; - - static constexpr bool is_layout_right = - std::is_same<typename traits::array_layout, Kokkos::LayoutRight>::value; - - static constexpr bool is_layout_stride = - std::is_same<typename traits::array_layout, Kokkos::LayoutStride>::value; - - static constexpr bool is_default_map = - std::is_void<typename traits::specialize>::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); \ - Kokkos::Impl::view_verify_operator_bounds<typename traits::memory_space>( \ - __VA_ARGS__); - -#else - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) 
\ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); - +#if defined(KOKKOS_ENABLE_IMPL_MDSPAN) && !defined(KOKKOS_COMPILER_INTEL) +#include <View/Kokkos_BasicView.hpp> #endif - template <typename... Is> - static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(rank <= sizeof...(Is), ""); - static_assert(sizeof...(Is) <= 8, ""); - static_assert(Kokkos::Impl::are_integral<Is...>::value, ""); - } - - template <typename... Is> - static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is), ""); - static_assert(Kokkos::Impl::are_integral<Is...>::value, ""); - } - - public: - //------------------------------ - // Rank 1 default map operator() - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0>::value && // - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0>::value && // - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 1 operator[] - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == rank) && Kokkos::Impl::are_integral<I0>::value && !is_default_map), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.reference(i0); - } - - template <typename I0> - 
KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral<I0>::value && - is_default_map && !is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral<I0>::value && - is_default_map && is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 default map operator() - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1>::value && // - (2 == rank) && is_default_map && is_layout_left && (rank_dynamic == 0)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1>::value && // - (2 == rank) && is_default_map && is_layout_left && (rank_dynamic != 0)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1>::value && // - (2 == rank) && is_default_map && is_layout_right && (rank_dynamic == 0)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i1 + 
m_map.m_impl_offset.m_dim.N1 * i0]; - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1>::value && // - (2 == rank) && is_default_map && is_layout_right && (rank_dynamic != 0)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, I1>::value && // - (2 == rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } - - // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which - // have "inlined" versions above - - template <typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<Is...>::value && // - (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; - } - - template <typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<Is...>::value && // - ((0 == rank) || !is_default_map)), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.reference(indices...); - } - - //------------------------------ - // Rank 0 - - template <typename... 
Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<Is...>::value && (0 == rank)), reference_type> - access(Is... extra) const { - check_access_member_function_valid_args(extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) - return m_map.reference(); - } - - //------------------------------ - // Rank 1 - - template <typename I0, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value && - (1 == rank) && !is_default_map), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.reference(i0); - } - - template <typename I0, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value && - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[i0]; - } - - template <typename I0, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value && - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 - - template <typename I0, typename I1, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, Is...>::value && - (2 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, Is... 
extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.reference(i0, i1); - } - - template <typename I0, typename I1, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) && - is_default_map && is_layout_left && (rank_dynamic == 0)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - } - - template <typename I0, typename I1, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) && - is_default_map && is_layout_left && (rank_dynamic != 0)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } - - template <typename I0, typename I1, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) && - is_default_map && is_layout_right && (rank_dynamic == 0)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - } - - template <typename I0, typename I1, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) && - is_default_map && is_layout_right && (rank_dynamic != 0)), - reference_type> - access(I0 i0, I1 i1, Is... 
extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } - - template <typename I0, typename I1, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, Is...>::value && - (2 == rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } - - //------------------------------ - // Rank 3 - - template <typename I0, typename I1, typename I2, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, Is...>::value && - (3 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; - } - - template <typename I0, typename I1, typename I2, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, Is...>::value && - (3 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.reference(i0, i1, i2); - } - - //------------------------------ - // Rank 4 - - template <typename I0, typename I1, typename I2, typename I3, typename... 
Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == rank) && - is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; - } - - template <typename I0, typename I1, typename I2, typename I3, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == rank) && - !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.reference(i0, i1, i2, i3); - } - - //------------------------------ - // Rank 5 - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, Is...>::value && - (5 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, Is...>::value && - (5 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... 
extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.reference(i0, i1, i2, i3, i4); - } - - //------------------------------ - // Rank 6 - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, Is...>::value && - (6 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, Is...>::value && - (6 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - //------------------------------ - // Rank 7 - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, Is...>::value && - (7 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... 
extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, Is...>::value && - (7 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - //------------------------------ - // Rank 8 - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6, typename I7, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, - I7, Is...>::value && - (8 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map - .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6, typename I7, typename... Is> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, - I7, Is...>::value && - (8 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... 
extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); - } - -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - - //---------------------------------------- - // Standard destructor, constructors, and assignment operators - - KOKKOS_DEFAULTED_FUNCTION - ~View() = default; - - KOKKOS_DEFAULTED_FUNCTION - View() = default; - - KOKKOS_FUNCTION - View(const View& other) : m_track(other.m_track), m_map(other.m_map) { - KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View(View&& other) - : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { - KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View& operator=(const View& other) { - m_map = other.m_map; - m_track = other.m_track; - - KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) - - return *this; - } - - KOKKOS_FUNCTION - View& operator=(View&& other) { - m_map = std::move(other.m_map); - m_track = std::move(other.m_track); - - KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) - - return *this; - } - - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. - - template <class RT, class... 
RP> - KOKKOS_INLINE_FUNCTION View( - const View<RT, RP...>& rhs, - std::enable_if_t<Kokkos::Impl::ViewMapping< - traits, typename View<RT, RP...>::traits, - typename traits::specialize>::is_assignable_data_type>* = nullptr) - : m_track(rhs), m_map() { - using SrcTraits = typename View<RT, RP...>::traits; - using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, - typename traits::specialize>; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - } - - template <class RT, class... RP> - KOKKOS_INLINE_FUNCTION std::enable_if_t< - Kokkos::Impl::ViewMapping< - traits, typename View<RT, RP...>::traits, - typename traits::specialize>::is_assignable_data_type, - View>& - operator=(const View<RT, RP...>& rhs) { - using SrcTraits = typename View<RT, RP...>::traits; - using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, - typename traits::specialize>; - static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - m_track.assign(rhs); - return *this; - } - - //---------------------------------------- - // Compatible subview constructor - // may assign unmanaged from managed. - - template <class RT, class... RP, class Arg0, class... Args> - KOKKOS_INLINE_FUNCTION View(const View<RT, RP...>& src_view, const Arg0 arg0, - Args... 
args) - : m_track(src_view), m_map() { - using SrcType = View<RT, RP...>; - - using Mapping = Kokkos::Impl::ViewMapping<void, typename SrcType::traits, - Arg0, Args...>; - - using DstType = typename Mapping::type; - - static_assert( - Kokkos::Impl::ViewMapping<traits, typename DstType::traits, - typename traits::specialize>::is_assignable, - "Subview construction requires compatible view and subview arguments"); - - Mapping::assign(m_map, src_view.m_map, arg0, args...); - } - - //---------------------------------------- - // Allocation tracking properties - - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.m_tracker.use_count(); } - - inline const std::string label() const { - return m_track.m_tracker - .template get_label<typename traits::memory_space>(); - } - - public: - //---------------------------------------- - // Allocation according to allocation properties and array layout - - template <class... P> - explicit inline View( - const Impl::ViewCtorProp<P...>& arg_prop, - std::enable_if_t<!Impl::ViewCtorProp<P...>::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), m_map() { - // Copy the input allocation properties with possibly defaulted properties - // We need to split it in two to avoid MSVC compiler errors - auto prop_copy_tmp = - Impl::with_properties_if_unset(arg_prop, std::string{}); - auto prop_copy = Impl::with_properties_if_unset( - prop_copy_tmp, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. 
- Kokkos::Impl::throw_runtime_exception( - "Constructing View and initializing data with uninitialized " - "execution space"); - } - - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property<Impl::LabelTag>(prop_copy); - Impl::runtime_check_rank( - rank, rank_dynamic, - std::is_same<typename traits::specialize, void>::value, i0, i1, i2, i3, - i4, i5, i6, i7, alloc_name); - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, Impl::ViewCtorProp<P...>::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.m_tracker.assign_allocated_record_to_uninitialized(record); - } - - KOKKOS_INLINE_FUNCTION - void assign_data(pointer_type arg_data) { - m_track.m_tracker.clear(); - m_map.assign_data(arg_data); - } - - // Wrap memory according to properties and array layout - template <class... P> - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp<P...>& arg_prop, - std::enable_if_t<Impl::ViewCtorProp<P...>::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { - static_assert( - std::is_same<pointer_type, - typename Impl::ViewCtorProp<P...>::pointer_type>::value, - "Constructing View to wrap user memory must supply matching pointer " - "type"); - } - - // Simple dimension-only layout - template <class... 
P> - explicit inline View( - const Impl::ViewCtorProp<P...>& arg_prop, - std::enable_if_t<!Impl::ViewCtorProp<P...>::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - template <class... P> - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp<P...>& arg_prop, - std::enable_if_t<Impl::ViewCtorProp<P...>::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - } - - // Allocate with label and layout - template <typename Label> - explicit inline View( - const Label& arg_label, - std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value, - typename traits::array_layout> const& arg_layout) - : View(Impl::ViewCtorProp<std::string>(arg_label), arg_layout) {} - - // Allocate label and layout, must disambiguate from subview constructor. - template <typename Label> - explicit inline View( - const Label& arg_label, - std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value, const size_t> - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp<std::string>(arg_label), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - // Construct view from ViewTracker and map - // This should be the preferred method because future extensions may need to - // use the ViewTracker class. 
- template <class Traits> - KOKKOS_INLINE_FUNCTION View( - const view_tracker_type& track, - const Kokkos::Impl::ViewMapping<Traits, typename Traits::specialize>& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping<traits, Traits, typename traits::specialize>; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track.m_tracker); - } - - // Construct View from internal shared allocation tracker object and map - // This is here for backwards compatibility for classes that derive from - // Kokkos::View - template <class Traits> - KOKKOS_INLINE_FUNCTION View( - const typename view_tracker_type::track_type& track, - const Kokkos::Impl::ViewMapping<Traits, typename Traits::specialize>& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping<traits, Traits, typename traits::specialize>; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track); - } - - //---------------------------------------- - // Memory span required to wrap these dimensions. - static constexpr size_t required_allocation_size( - typename traits::array_layout const& layout) { - return map_type::memory_span(layout); - } - - static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp<pointer_type>(arg_ptr), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp<pointer_type>(arg_ptr), arg_layout) {} - - //---------------------------------------- - // Shared scratch memory constructor - - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. 
Use " - "overload taking a layout object instead."); - const size_t num_passed_args = Impl::count_valid_integers( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); - - if (std::is_void<typename traits::specialize>::value && - num_passed_args != rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - - return View::shmem_size(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - private: - // Want to be able to align to minimum scratch alignment or sizeof or alignof - // elements - static constexpr size_t scratch_value_alignment = - max({sizeof(typename traits::value_type), - alignof(typename traits::value_type), - static_cast<size_t>( - traits::execution_space::scratch_memory_space::ALIGN)}); - - public: - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(typename traits::array_layout const& arg_layout) { - return map_type::memory_span(arg_layout) + scratch_value_alignment; - } - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp<pointer_type>(reinterpret_cast<pointer_type>( - arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), - scratch_value_alignment))), - arg_layout) {} - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp<pointer_type>( - 
reinterpret_cast<pointer_type>(arg_space.get_shmem_aligned( - map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, - arg_N7)), - scratch_value_alignment))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } -}; - -template <typename D, class... P> -KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View<D, P...>&) { - return View<D, P...>::rank(); -} - -namespace Impl { - -template <typename ValueType, unsigned int Rank> -struct RankDataType { - using type = typename RankDataType<ValueType, Rank - 1>::type*; -}; - -template <typename ValueType> -struct RankDataType<ValueType, 0> { - using type = ValueType; -}; - -template <unsigned N, typename... Args> -KOKKOS_FUNCTION std::enable_if_t< - N == View<Args...>::rank() && - std::is_same<typename ViewTraits<Args...>::specialize, void>::value, - View<Args...>> -as_view_of_rank_n(View<Args...> v) { - return v; -} - -// Placeholder implementation to compile generic code for DynRankView; should -// never be called -template <unsigned N, typename T, typename... Args> -KOKKOS_FUNCTION std::enable_if_t< - N != View<T, Args...>::rank() && - std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value, - View<typename RankDataType<typename View<T, Args...>::value_type, N>::type, - Args...>> -as_view_of_rank_n(View<T, Args...>) { - Kokkos::abort("Trying to get at a View of the wrong rank"); - return {}; -} - -template <typename Function, typename... 
Args> -void apply_to_view_of_static_rank(Function&& f, View<Args...> a) { - f(a); -} - -} // namespace Impl -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Impl { -template <class ValueType, class TypeList> -struct TypeListToViewTraits; - -template <class ValueType, class... Properties> -struct TypeListToViewTraits<ValueType, Kokkos::Impl::type_list<Properties...>> { - using type = ViewTraits<ValueType, Properties...>; -}; - -// It is not safe to assume that subviews of views with the Aligned memory trait -// are also aligned. Hence, just remove that attribute for subviews. -template <class D, class... P> -struct RemoveAlignedMemoryTrait { - private: - using type_list_in = Kokkos::Impl::type_list<P...>; - using memory_traits = typename ViewTraits<D, P...>::memory_traits; - using type_list_in_wo_memory_traits = - typename Kokkos::Impl::type_list_remove_first<memory_traits, - type_list_in>::type; - using new_memory_traits = - Kokkos::MemoryTraits<memory_traits::impl_value & ~Kokkos::Aligned>; - using new_type_list = typename Kokkos::Impl::concat_type_list< - type_list_in_wo_memory_traits, - Kokkos::Impl::type_list<new_memory_traits>>::type; - - public: - using type = typename TypeListToViewTraits<D, new_type_list>::type; -}; -} // namespace Impl - -template <class D, class... P, class... Args> -KOKKOS_INLINE_FUNCTION auto subview(const View<D, P...>& src, Args... args) { - static_assert(View<D, P...>::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait<D, P...>::type, - Args...>::type(src, args...); -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template <class MemoryTraits, class D, class... P, class... 
Args> -KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View<D, P...>& src, - Args... args) { - static_assert(View<D, P...>::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - static_assert(Kokkos::is_memory_traits<MemoryTraits>::value); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait<D, P..., MemoryTraits>::type, - Args...>::type(src, args...); -} -#endif - -template <class V, class... Args> -using Subview = decltype(subview(std::declval<V>(), std::declval<Args>()...)); - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template <class LT, class... LP, class RT, class... RP> -KOKKOS_INLINE_FUNCTION bool operator==(const View<LT, LP...>& lhs, - const View<RT, RP...>& rhs) { - // Same data, layout, dimensions - using lhs_traits = ViewTraits<LT, LP...>; - using rhs_traits = ViewTraits<RT, RP...>; - - return std::is_same<typename lhs_traits::const_value_type, - typename rhs_traits::const_value_type>::value && - std::is_same<typename lhs_traits::array_layout, - typename rhs_traits::array_layout>::value && - std::is_same<typename lhs_traits::memory_space, - typename rhs_traits::memory_space>::value && - View<LT, LP...>::rank() == View<RT, RP...>::rank() && - lhs.data() == rhs.data() && lhs.span() == rhs.span() && - lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && - lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && - lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && - lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); -} - -template <class LT, class... LP, class RT, class... 
RP> -KOKKOS_INLINE_FUNCTION bool operator!=(const View<LT, LP...>& lhs, - const View<RT, RP...>& rhs) { - return !(operator==(lhs, rhs)); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -inline void shared_allocation_tracking_disable() { - Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_disable(); -} - -inline void shared_allocation_tracking_enable() { - Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_enable(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <class Specialize, typename A, typename B> -struct CommonViewValueType; - -template <typename A, typename B> -struct CommonViewValueType<void, A, B> { - using value_type = std::common_type_t<A, B>; -}; - -template <class Specialize, class ValueType> -struct CommonViewAllocProp; - -template <class ValueType> -struct CommonViewAllocProp<void, ValueType> { - using value_type = ValueType; - using scalar_array_type = ValueType; - - template <class... Views> - KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const Views&...) {} -}; - -template <class... Views> -struct DeduceCommonViewAllocProp; - -// Base case must provide types for: -// 1. specialize 2. value_type 3. is_view 4. prop_type -template <class FirstView> -struct DeduceCommonViewAllocProp<FirstView> { - using specialize = typename FirstView::traits::specialize; - - using value_type = typename FirstView::traits::value_type; - - enum : bool { is_view = is_view<FirstView>::value }; - - using prop_type = CommonViewAllocProp<specialize, value_type>; -}; - -template <class FirstView, class... 
NextViews> -struct DeduceCommonViewAllocProp<FirstView, NextViews...> { - using NextTraits = DeduceCommonViewAllocProp<NextViews...>; - - using first_specialize = typename FirstView::traits::specialize; - using first_value_type = typename FirstView::traits::value_type; - - enum : bool { first_is_view = is_view<FirstView>::value }; - - using next_specialize = typename NextTraits::specialize; - using next_value_type = typename NextTraits::value_type; - - enum : bool { next_is_view = NextTraits::is_view }; - - // common types - - // determine specialize type - // if first and next specialize differ, but are not the same specialize, error - // out - static_assert(!(!std::is_same<first_specialize, next_specialize>::value && - !std::is_void<first_specialize>::value && - !std::is_void<next_specialize>::value), - "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " - "specialize trait allowed"); - - // otherwise choose non-void specialize if either/both are non-void - using specialize = std::conditional_t< - std::is_same<first_specialize, next_specialize>::value, first_specialize, - std::conditional_t<(std::is_void<first_specialize>::value && - !std::is_void<next_specialize>::value), - next_specialize, first_specialize>>; - - using value_type = typename CommonViewValueType<specialize, first_value_type, - next_value_type>::value_type; - - enum : bool { is_view = (first_is_view && next_is_view) }; - - using prop_type = CommonViewAllocProp<specialize, value_type>; -}; - -} // end namespace Impl - -template <class... Views> -using DeducedCommonPropsType = - typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type; - -// This function is required in certain scenarios where users customize -// Kokkos View internals. One example are dynamic length embedded ensemble -// types. The function is used to propagate necessary information -// (like the ensemble size) when creating new views. -// However, most of the time it is called with a single view. 
-// Furthermore, the propagated information is not just for view allocations. -// From what I can tell, the type of functionality provided by -// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, -// a mechanism we will eventually use to replace this clunky approach here, when -// we are finally mdspan based. -// TODO: get rid of this when we have mdspan -template <class... Views> -KOKKOS_INLINE_FUNCTION DeducedCommonPropsType<Views...> common_view_alloc_prop( - Views const&... views) { - return DeducedCommonPropsType<Views...>(views...); -} - -} // namespace Kokkos - -#include <impl/Kokkos_ViewUniformType.hpp> -#include <impl/Kokkos_Atomic_View.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +#include <View/Kokkos_ViewLegacy.hpp> -#endif /* #ifndef KOKKOS_VIEW_HPP */ +#endif /* KOKKOS_VIEW_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index efa56a086e36ceafae0ce05b22b30b49c7d41d2c..4d2263428154457b4b4ae9a6b7e06c645e0599fb 100644 --- a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -120,7 +120,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> { (std::int32_t)BEGIN_TOKEN))) { // Attempt to claim ready work index succeeded, // update the hint and return work index - atomic_increment(begin_hint); + atomic_inc(begin_hint); return w; } // arrive here when ready_queue[i] == BEGIN_TOKEN @@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> { void operator()(const TagCount, int i) const noexcept { std::int32_t* const count_queue = &m_queue[m_graph.numRows()]; - atomic_increment(count_queue + m_graph.entries[i]); + atomic_inc(count_queue + m_graph.entries[i]); } KOKKOS_INLINE_FUNCTION diff --git 
a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp index f54c44d66f01b6f13db5aecbb6d7ab783650aed3..37fcfb7a1d994e551069b241ed2fa4f837b17100 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -23,7 +23,19 @@ #include <impl/Kokkos_ExecSpaceManager.hpp> #include <impl/Kokkos_DeviceManagement.hpp> +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) +#include <cuda_runtime.h> +#elif defined(KOKKOS_ARCH_AMD_GPU) +// FIXME_OPENACC - hip_runtime_api.h contains two implementations: one for AMD +// GPUs and the other for NVIDIA GPUs; below macro is needed to choose AMD GPUs. +#define __HIP_PLATFORM_AMD__ +#include <hip/hip_runtime_api.h> +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) +#include <thread> +#endif + #include <iostream> +#include <sstream> Kokkos::Experimental::OpenACC::OpenACC() : m_space_instance( @@ -46,6 +58,8 @@ Kokkos::Experimental::OpenACC::OpenACC(int async_arg) void Kokkos::Experimental::OpenACC::impl_initialize( InitializationSettings const& settings) { + Impl::OpenACCInternal::m_concurrency = + 256000; // FIXME_OPENACC - random guess when cannot compute if (Impl::OpenACC_Traits::may_fallback_to_host && acc_get_num_devices(Impl::OpenACC_Traits::dev_type) == 0 && !settings.has_device_id()) { @@ -58,10 +72,47 @@ void Kokkos::Experimental::OpenACC::impl_initialize( Impl::OpenACCInternal::m_acc_device_num = acc_get_device_num(acc_device_host); } else { + using Kokkos::Impl::get_visible_devices; + acc_set_device_type(Impl::OpenACC_Traits::dev_type); + std::vector<int> const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - int const dev_num = get_gpu(settings); + int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + cudaDeviceProp 
deviceProp; + cudaError error = cudaGetDeviceProperties(&deviceProp, dev_num); + if (error != cudaSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "CUDA device properties: (" << cudaGetErrorName(error) + << "): " << cudaGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ARCH_AMD_GPU) + hipDeviceProp_t deviceProp; + hipError_t error = hipGetDeviceProperties(&deviceProp, dev_num); + if (error != hipSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "HIP device properties: (" << hipGetErrorName(error) + << "): " << hipGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + Impl::OpenACCInternal::m_concurrency = std::thread::hardware_concurrency(); + if (Impl::OpenACCInternal::m_concurrency == 0) { + Kokkos::Impl::host_abort( + "Error: During OpenACC backend initialization, failed to retrieve " + "CPU hardware concurrency"); + } +#else + // FIXME_OPENACC: Compute Impl::OpenACCInternal::m_concurrency correctly. 
+#endif } Impl::OpenACCInternal::singleton().initialize(); } @@ -84,6 +135,12 @@ void Kokkos::Experimental::OpenACC::print_configuration(std::ostream& os, os << "yes\n"; #else os << "no\n"; +#endif + os << " KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE: "; +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + os << "yes\n"; +#else + os << "no\n"; #endif m_space_instance->print_configuration(os, verbose); } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index b012f6a42a41ace7e502bd884f682c2189e5a2a6..aee696bd34e62518793added4fe6310bb658b2a4 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -42,6 +42,7 @@ static_assert(false, // LLVM/Clacc compiler does not need this. #ifndef KOKKOS_COMPILER_CLANG #define KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS +#define KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS #endif namespace Kokkos::Experimental::Impl { @@ -87,11 +88,15 @@ class OpenACC { static char const* name() { return "OpenACC"; } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency() { return 256000; } // FIXME_OPENACC + static int concurrency(); #else - int concurrency() const { return 256000; } // FIXME_OPENACC + int concurrency() const; +#endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool in_parallel() { + return acc_on_device(acc_device_not_host); + } #endif - static bool in_parallel() { return acc_on_device(acc_device_not_host); } uint32_t impl_instance_id() const noexcept; Impl::OpenACCInternal* impl_internal_space_instance() const { return m_space_instance.get(); diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index 141ec77fd1f01187b61b4cb60041714654580632..c8a5d28ba83f2a525753265949710eb87ebe2b56 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ 
b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -19,8 +19,8 @@ #include <OpenACC/Kokkos_OpenACC.hpp> #include <OpenACC/Kokkos_OpenACCSpace.hpp> #include <OpenACC/Kokkos_OpenACC_DeepCopy.hpp> -#include <impl/Kokkos_MemorySpace.hpp> #include <impl/Kokkos_Profiling_Interface.hpp> +#include <impl/Kokkos_Error.hpp> #include <openacc.h> @@ -66,6 +66,10 @@ void *Kokkos::Experimental::OpenACCSpace::impl_allocate( ptr = acc_malloc(arg_alloc_size); + if (!ptr) { + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp index 4e7170cbbdf350e05e6a36ae77ab2f683d367076..75cef98a8d91b7c5853f0e9ea69ecdc6e0b51020 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp @@ -85,16 +85,26 @@ class OpenACCSpace { template <> struct Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, Kokkos::Experimental::OpenACCSpace> { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; template <> struct Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace, Kokkos::HostSpace> { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp index 
4aed7e00f7651eeeba9b2ab9d87c7ccbdb766488..ca022192b0bc5fd3203f3e464604aeb6262b5496 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp @@ -34,7 +34,7 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, // value checking is added as a safeguard. (The current NVHPC (V22.5) // supports OpenACC V2.7.) if (n > 0) { - acc_memcpy_device(dst, const_cast<void*>(src), n); + acc_memcpy_device_async(dst, const_cast<void*>(src), n, acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -52,7 +52,7 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_device(dst, const_cast<void*>(src), n); + acc_memcpy_device_async(dst, const_cast<void*>(src), n, acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -60,7 +60,7 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, "Kokkos::Impl::DeepCopy<OpenACCSpace, OpenACCSpace, " "ExecutionSpace>::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_device(dst, const_cast<void*>(src), n); + acc_memcpy_device_async(dst, const_cast<void*>(src), n, acc_async_noval); } } }; @@ -70,7 +70,9 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, Kokkos::HostSpace, Kokkos::Experimental::OpenACC> { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_to_device(dst, const_cast<void*>(src), n); + if (n > 0) + acc_memcpy_to_device_async(dst, const_cast<void*>(src), n, + acc_async_noval); } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, const void* src, size_t n) { @@ -85,7 +87,8 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, Kokkos::HostSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_to_device(dst, const_cast<void*>(src), 
n); + acc_memcpy_to_device_async(dst, const_cast<void*>(src), n, + acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -93,7 +96,8 @@ struct Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, "Kokkos::Impl::DeepCopy<OpenACCSpace, HostSpace, " "ExecutionSpace>::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_to_device(dst, const_cast<void*>(src), n); + acc_memcpy_to_device_async(dst, const_cast<void*>(src), n, + acc_async_noval); } } }; @@ -104,7 +108,8 @@ struct Kokkos::Impl::DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::OpenACC> { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_from_device(dst, const_cast<void*>(src), n); + acc_memcpy_from_device_async(dst, const_cast<void*>(src), n, + acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -120,14 +125,17 @@ template <class ExecutionSpace> struct Kokkos::Impl::DeepCopy< Kokkos::HostSpace, Kokkos::Experimental::OpenACCSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_from_device(dst, const_cast<void*>(src), n); + if (n > 0) + acc_memcpy_from_device_async(dst, const_cast<void*>(src), n, + acc_async_noval); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( "Kokkos::Impl::DeepCopy<HostSpace, OpenACCSpace, " "ExecutionSpace>::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_from_device(dst, const_cast<void*>(src), n); + acc_memcpy_from_device_async(dst, const_cast<void*>(src), n, + acc_async_noval); } } }; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp index 82d38586eb8fe35dea59925eafdf02a09ec1144b..1373f8fa7a48faa0c814db8a7be3be00ed9ea517 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp @@ 
-38,7 +38,7 @@ class FunctorAdapter; \ KOKKOS_IMPL_ACC_PRAGMA(routine CLAUSE) \ template <class... Args> \ - KOKKOS_FUNCTION void operator()(Args &&... args) const { \ + KOKKOS_FUNCTION void operator()(Args &&...args) const { \ if constexpr (std::is_void_v<WorkTag>) { \ m_functor(static_cast<Args &&>(args)...); \ } else { \ diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp index 10a76fbd31361b4a5c4265439409e1f929384efb..1dad499c1bec5acd95b7316c52f568365ac7be13 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp @@ -27,6 +27,7 @@ // Arbitrary value to denote that we don't know yet what device to use. int Kokkos::Experimental::Impl::OpenACCInternal::m_acc_device_num = -1; +int Kokkos::Experimental::Impl::OpenACCInternal::m_concurrency = -1; Kokkos::Experimental::Impl::OpenACCInternal& Kokkos::Experimental::Impl::OpenACCInternal::singleton() { @@ -78,8 +79,18 @@ void Kokkos::Experimental::Impl::OpenACCInternal::fence( [&]() { acc_wait(m_async_arg); }); } -uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() const - noexcept { +uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() + const noexcept { return Kokkos::Tools::Experimental::Impl::idForInstance<OpenACC>( reinterpret_cast<uintptr_t>(this)); } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +int Kokkos::Experimental::OpenACC::concurrency() { + return Impl::OpenACCInternal::m_concurrency; +} +#else +int Kokkos::Experimental::OpenACC::concurrency() const { + return Impl::OpenACCInternal::m_concurrency; +} +#endif diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index 6645616ba51984ce917d543481bb09782fa8e5f9..343d9921a95a0864c7e77bbb81bdabb101d75741 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ 
b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -30,12 +30,13 @@ namespace Kokkos::Experimental::Impl { class OpenACCInternal { bool m_is_initialized = false; - OpenACCInternal(const OpenACCInternal&) = default; + OpenACCInternal(const OpenACCInternal&) = default; OpenACCInternal& operator=(const OpenACCInternal&) = default; public: static int m_acc_device_num; - int m_async_arg = acc_async_sync; + static int m_concurrency; + int m_async_arg = acc_async_noval; OpenACCInternal() = default; @@ -43,7 +44,7 @@ class OpenACCInternal { bool verify_is_initialized(const char* const label) const; - void initialize(int async_arg = acc_async_sync); + void initialize(int async_arg = acc_async_noval); void finalize(); bool is_initialized() const; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index 550436fe7beceba231454017eb709f27b1b9aa7e..629d26928ed3aad94c008fe587a91cdd27a1cbfb 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -30,10 +30,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i1 = m / dim0 + begin1; + auto i0 = m % dim0 + begin0; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector 
collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -42,6 +55,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, functor(i0, i1); } } +#endif } template <class Functor> @@ -50,10 +64,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i0 = m / dim1 + begin0; + auto i1 = m % dim1 + begin1; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -62,6 +89,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, functor(i0, i1); } } +#endif } template <class Functor> @@ -71,12 +99,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1) copyin(functor) async(async_arg) // clang-format on @@ -94,12 +122,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, 
int async_arg) { - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; // clang-format off #pragma acc parallel loop gang vector tile(tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -116,12 +144,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim1 * dim0; + auto i2 = m / tmp1 + begin2; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -132,6 +177,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template <class Functor> @@ -140,12 +186,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + 
auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim2 + begin1; + auto i2 = tmp2 % dim2 + begin2; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -156,6 +219,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template <class Functor> @@ -165,15 +229,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2) copyin(functor) async(async_arg) // clang-format on @@ -193,15 +257,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; 
- int begin2 = begin[2]; - int end2 = end[2]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; // clang-format off #pragma acc parallel loop gang vector tile(tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -220,14 +284,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1 * dim0; + auto i3 = m / tmp1 + begin3; + auto tmp2 = m % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -240,6 +325,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template <class Functor> @@ -248,14 +334,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<4> const& begin, 
OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + auto i2 = tmp2 / dim3 + begin2; + auto i3 = tmp2 % dim3 + begin3; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -268,6 +375,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template <class Functor> @@ -277,18 +385,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = 
begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3) copyin(functor) async(async_arg) // clang-format on @@ -310,18 +418,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; // clang-format off #pragma acc parallel loop gang vector tile(tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -342,16 +450,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 
* dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = m / tmp1 + begin4; + auto tmp2 = m % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -366,6 +499,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template <class Functor> @@ -374,16 +508,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim4 * dim3 * dim2; + auto i1 
= tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i3 = tmp2 / dim4 + begin3; + auto i4 = tmp2 % dim4 + begin4; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -398,6 +557,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template <class Functor> @@ -407,21 +567,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4) copyin(functor) async(async_arg) // clang-format on @@ -445,21 +605,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - 
int begin4 = begin[4]; - int end4 = end[4]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; // clang-format off #pragma acc parallel loop gang vector tile(tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -482,18 +642,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; + auto i5 = m / tmp1 + begin5; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = tmp2 / tmp1 + begin4; + tmp2 = tmp2 % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + 
begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -510,6 +699,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template <class Functor> @@ -518,18 +708,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim5 * dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4; + auto i3 = tmp2 / 
tmp1 + begin3; + tmp2 = tmp2 % tmp1; + auto i4 = tmp2 / dim5 + begin4; + auto i5 = tmp2 % dim5 + begin5; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -546,6 +765,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template <class Functor> @@ -555,24 +775,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int tile5 = tile[5]; - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto tile5 = tile[5]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4,tile5) copyin(functor) async(async_arg) // clang-format on @@ -598,24 +818,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile5 = tile[5]; - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int 
begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto tile5 = tile[5]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; // clang-format off #pragma acc parallel loop gang vector tile(tile5,tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp index 4fce680aef09b45e4b91c6f703bd857bd975e4c3..2b98018e3bb9c7fa3c126a69ccf7d1c908f80266 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_Team.hpp @@ -44,10 +44,12 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, auto team_size = m_policy.team_size(); auto vector_length = m_policy.impl_vector_length(); + int const async_arg = m_policy.space().acc_async_queue(); + auto const a_functor(m_functor); #pragma acc parallel loop gang vector num_gangs(league_size) \ - vector_length(team_size* vector_length) copyin(a_functor) + vector_length(team_size* vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size * team_size * vector_length; i++) { int league_id = i / (team_size * vector_length); typename Policy::member_type team(league_id, league_size, team_size, @@ -145,10 +147,12 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, auto team_size = m_policy.team_size(); auto vector_length = 
m_policy.impl_vector_length(); + int const async_arg = m_policy.space().acc_async_queue(); + auto const a_functor(m_functor); #pragma acc parallel loop gang num_gangs(league_size) num_workers(team_size) \ - vector_length(vector_length) copyin(a_functor) + vector_length(vector_length) copyin(a_functor) async(async_arg) for (int i = 0; i < league_size; i++) { int league_id = i; typename Policy::member_type team(league_id, league_size, team_size, diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 2c7793dc11650de1d70de7c2516e5d4c7c3ae50b..2b5631d6f8a35826d24499e077c8e2292ab333a5 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -31,7 +31,7 @@ template <class Functor, class Reducer, class Policy, struct OpenACCParallelReduceMDRangeHelper { OpenACCParallelReduceMDRangeHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(!Kokkos::Impl::always_true<Functor>::value, + static_assert(Kokkos::Impl::always_false<Functor>::value, "not implemented"); } }; @@ -113,6 +113,404 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } }; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + +#define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, \ + OPERATOR) \ + namespace Kokkos::Experimental::Impl { \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ 
+ /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i1 = m / dim0 + begin1; \ + auto i0 = m % dim0 + begin0; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i0 = m / dim1 + begin0; \ + auto i1 = m % dim1 + begin1; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for 
(decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim1 * dim0; \ + auto i2 = m / tmp1 + begin2; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim2 + begin1; \ + auto i2 = tmp2 % dim2 + begin2; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = 
dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1 * dim0; \ + auto i3 = m / tmp1 + begin3; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + auto i2 = tmp2 / dim3 + begin2; \ + auto i3 = tmp2 % dim3 + begin3; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + 
OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = m / tmp1 + begin4; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + 
auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i3 = tmp2 / dim4 + begin3; \ + auto i4 = tmp2 % dim4 + begin4; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; \ + auto i5 = m / tmp1 + begin5; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = tmp2 / tmp1 + begin4; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = 
dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template <class ValueType, class Functor> \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim5 * dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + auto i4 = tmp2 / dim5 + begin4; \ + auto i5 = tmp2 % dim5 + begin5; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + } // namespace 
Kokkos::Experimental::Impl + +#else + #define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, \ OPERATOR) \ namespace Kokkos::Experimental::Impl { \ @@ -124,10 +522,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(2) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -136,6 +534,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, functor(i0, i1, val); \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -147,10 +546,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(2) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -159,6 +558,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, functor(i0, i1, val); \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -170,12 +570,12 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin2 = begin[2]; \ - int end2 = end[2]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; 
\ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(3) reduction( \ OPERATOR \ @@ -188,6 +588,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -199,12 +600,12 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - int begin2 = begin[2]; \ - int end2 = end[2]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(3) reduction( \ OPERATOR \ @@ -217,6 +618,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -228,14 +630,14 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin3 = begin[3]; \ - int end3 = end[3]; \ - int begin2 = begin[2]; \ - int end2 = end[2]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(4) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -248,6 +650,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -259,14 +662,14 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int 
begin0 = begin[0]; \ - int end0 = end[0]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - int begin2 = begin[2]; \ - int end2 = end[2]; \ - int begin3 = begin[3]; \ - int end3 = end[3]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(4) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -279,6 +682,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -290,16 +694,16 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin4 = begin[4]; \ - int end4 = end[4]; \ - int begin3 = begin[3]; \ - int end3 = end[3]; \ - int begin2 = begin[2]; \ - int end2 = end[2]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(5) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -314,6 +718,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -325,16 +730,16 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - int begin2 = begin[2]; \ - int end2 = end[2]; \ - int 
begin3 = begin[3]; \ - int end3 = end[3]; \ - int begin4 = begin[4]; \ - int end4 = end[4]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(5) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -349,6 +754,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -360,18 +766,18 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin5 = begin[5]; \ - int end5 = end[5]; \ - int begin4 = begin[4]; \ - int end4 = end[4]; \ - int begin3 = begin[3]; \ - int end3 = end[3]; \ - int begin2 = begin[2]; \ - int end2 = end[2]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(6) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -388,6 +794,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -399,18 +806,18 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, int async_arg) { \ auto val = aval; \ auto const functor(afunctor); \ - int begin0 = begin[0]; \ - int end0 = end[0]; \ - int begin1 = begin[1]; \ - int end1 = end[1]; \ - 
int begin2 = begin[2]; \ - int end2 = end[2]; \ - int begin3 = begin[3]; \ - int end3 = end[3]; \ - int begin4 = begin[4]; \ - int end4 = end[4]; \ - int begin5 = begin[5]; \ - int end5 = end[5]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ /* clang-format off */ \ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector collapse(6) reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ /* clang-format on */ \ @@ -427,10 +834,13 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ } // namespace Kokkos::Experimental::Impl +#endif + #define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_MDRANGE_HELPER(REDUCER, OPERATOR) \ KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, OPERATOR) \ template <class Functor, class Scalar, class Space, class... 
Traits> \ diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp index b61a05a8ee1b20581670ee460d703e74b8804137..f9039e3bb4cd3b6461c3c8a4729ef3c8655effc2 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp @@ -31,7 +31,7 @@ template <class Functor, class Reducer, class Policy, bool = std::is_arithmetic_v<typename Reducer::value_type>> struct OpenACCParallelReduceHelper { OpenACCParallelReduceHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(!Kokkos::Impl::always_true<Functor>::value, + static_assert(Kokkos::Impl::always_false<Functor>::value, "not implemented"); } }; @@ -140,6 +140,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, functor(i, val); \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ \ @@ -169,6 +170,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, functor(i, val); \ } \ } \ + acc_wait(async_arg); \ aval = val; \ } \ } // namespace Kokkos::Experimental::Impl diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 3223ce3f9afa36ba6ebf5e529686bf3ef8a5d6f0..d4cb73164d20f5ae17c41e6d161b1427dc34207e 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -40,7 +40,7 @@ template <class Functor, class Reducer, class Policy, struct OpenACCParallelReduceTeamHelper { OpenACCParallelReduceTeamHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(!Kokkos::Impl::always_true<Functor>::value, + static_assert(Kokkos::Impl::always_false<Functor>::value, "not implemented"); } }; @@ -129,7 +129,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const 
Impl::TeamThreadRangeBoundariesStruct<iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true<Lambda>::value, + static_assert(Kokkos::Impl::always_false<Lambda>::value, "custom reduction is not implemented"); } @@ -140,7 +140,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true<Lambda>::value, + static_assert(Kokkos::Impl::always_false<Lambda>::value, "custom reduction is not implemented"); } @@ -163,13 +163,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer_v<ValueType>> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::OpenACCTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + wrapped_reducer.final(&tmp); result = tmp; } } @@ -180,15 +191,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer_v<ReducerType>> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - 
ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::OpenACCTeamMember::execution_space>, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -200,7 +221,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer_v<ValueType>> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::OpenACCTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -208,6 +239,7 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -218,9 +250,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer_v<ReducerType>> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const 
Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::OpenACCTeamMember::execution_space>, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -228,6 +268,8 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -239,7 +281,17 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::OpenACCTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -247,6 +299,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -273,10 +326,23 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer_v<ValueType>> parallel_reduce(const 
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::OpenACCTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + ValueType tmp = ValueType(); #pragma acc loop worker reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); result = tmp; } @@ -314,11 +380,22 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer_v<ValueType>> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::OpenACCTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } @@ -357,11 +434,23 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + 
TeamPolicy<typename Impl::OpenACCTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); result = tmp; } @@ -394,6 +483,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( vector_length); \ functor(team, val); \ } \ + acc_wait(async_arg); \ aval = val; \ } \ } // namespace Kokkos::Experimental::Impl diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp index c6d3267bdb0a0ec53ef540d8d383e645460373f4..b1c48baa1e7366d9e4f6d5843610a6d9a7acdad9 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -225,7 +225,7 @@ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ } #pragma acc exit data delete (functor, chunk_values, offset_values, \ - final_reducer)async(async_arg) + final_reducer)async(async_arg) acc_wait(async_arg); } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp index 91faa64f733380801b9f58770454be1997087288..76e1514476a05043c7411a33de857076828f2263 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp @@ -16,92 +16,11 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE -#include <OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp> +#include <OpenACC/Kokkos_OpenACC.hpp> #include <OpenACC/Kokkos_OpenACC_DeepCopy.hpp> -#include 
<impl/Kokkos_MemorySpace.hpp> -#include <Kokkos_HostSpace.hpp> - -#ifdef KOKKOS_ENABLE_DEBUG -Kokkos::Impl::SharedAllocationRecord<void, void> SharedAllocationRecord< - Kokkos::Experimental::OpenACCSpace, void>::s_root_record; -#endif - -Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, - void>::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - (SharedAllocationRecord<void, void>::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenACCSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, - void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - Kokkos::Impl::DeepCopy<Experimental::OpenACCSpace, HostSpace>( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, " - "void>::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenACC &arg_exec_space, - const Kokkos::Experimental::OpenACCSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, 
void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, - void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - Kokkos::Impl::DeepCopy<Experimental::OpenACCSpace, HostSpace>( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -//============================================================================== -// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1 +#include <OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp> #include <impl/Kokkos_SharedAlloc_timpl.hpp> -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicitly instantiate these CRTP base classes -// here, where we have access to the associated *_timpl.hpp header files. 
-template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; -template class Kokkos::Impl::SharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - -// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenACCSpace); diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp index cf83a5b27bcb2b5f3b32940ee18811a0dca8af4d..cde5ecdcb778520cc0a00f81b56171b5105ee0c7 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp @@ -20,55 +20,7 @@ #include <OpenACC/Kokkos_OpenACCSpace.hpp> #include <impl/Kokkos_SharedAlloc.hpp> -#include <openacc.h> - -template <> -class Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, - void> - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - friend class SharedAllocationRecordCommon<Kokkos::Experimental::OpenACCSpace>; - friend Kokkos::Experimental::OpenACCSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenACCSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenACCSpace m_space; - - protected: - 
~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenACC& exec_space, - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); -}; +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenACCSpace); #endif diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp index 4ec71f56ef662fb3a4cc7488bc28a65957eedb76..20ea392452b7c242d7e583305bb83ee1e29ab106 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp @@ -82,7 +82,7 @@ class OpenACCTeamMember { // FIXME_OPENACC: team_broadcast() is not implemented. 
template <class ValueType> KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const { - static_assert(!Kokkos::Impl::always_true<ValueType>::value, + static_assert(Kokkos::Impl::always_false<ValueType>::value, "Kokkos Error: team_broadcast() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -99,7 +99,7 @@ class OpenACCTeamMember { template <class ValueType, class JoinOp> KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value, const JoinOp& op_in) const { - static_assert(!Kokkos::Impl::always_true<ValueType>::value, + static_assert(Kokkos::Impl::always_false<ValueType>::value, "Kokkos Error: team_reduce() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -110,7 +110,7 @@ class OpenACCTeamMember { KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { static_assert( - !Kokkos::Impl::always_true<ArgType>::value, + Kokkos::Impl::always_false<ArgType>::value, "Kokkos Error: team_scan() is not implemented for the OpenACC backend"); return ArgType(); } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index faa50aa7c388eb247913d12a9c6f2c8d676d4e05..95526aa7849cceb4901663458b46ba4d745e8373 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -28,8 +28,11 @@ struct OpenACC_Traits { #elif defined(KOKKOS_ARCH_AMD_GPU) static constexpr acc_device_t dev_type = acc_device_radeon; static constexpr bool may_fallback_to_host = false; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + static constexpr acc_device_t dev_type = acc_device_host; + static constexpr bool may_fallback_to_host = true; #else - static constexpr acc_device_t dev_type = acc_device_not_host; + static constexpr acc_device_t dev_type = acc_device_default; static constexpr bool may_fallback_to_host = true; #endif }; diff --git 
a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp index 9a169a435c7380a226b7f157ad399ac2a3340665..0f65ba43a0d5c52cb87cc99abea8eeca58594173 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -72,42 +72,48 @@ int OpenMP::concurrency(OpenMP const &instance) { int OpenMP::concurrency() const { return impl_thread_pool_size(); } #endif +void OpenMP::impl_static_fence(std::string const &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + []() { + std::lock_guard<std::mutex> lock_all_instances( + Impl::OpenMPInternal::all_instances_mutex); + for (auto *instance_ptr : Impl::OpenMPInternal::all_instances) { + std::lock_guard<std::mutex> lock_instance( + instance_ptr->m_instance_mutex); + } + }); +} + void OpenMP::fence(const std::string &name) const { Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {}); + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + [this]() { + auto *internal_instance = this->impl_internal_space_instance(); + std::lock_guard<std::mutex> lock(internal_instance->m_instance_mutex); + }); } bool OpenMP::impl_is_initialized() noexcept { return Impl::OpenMPInternal::singleton().is_initialized(); } -bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return ( - (exec_space.impl_internal_space_instance()->m_level < omp_get_level()) && - (!Impl::t_openmp_instance || - Impl::t_openmp_instance->m_level < omp_get_level())); -#else +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); -#endif } +#endif int 
OpenMP::impl_thread_pool_size() const noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel(*this) - ? omp_get_num_threads() - : (Impl::t_openmp_instance - ? Impl::t_openmp_instance->m_pool_size - : impl_internal_space_instance()->m_pool_size); -#else - return OpenMP::in_parallel(*this) + return (impl_internal_space_instance()->get_level() < omp_get_level()) ? omp_get_num_threads() : impl_internal_space_instance()->m_pool_size; -#endif } int OpenMP::impl_max_hardware_threads() noexcept { - return Impl::g_openmp_hardware_max_threads; + return Impl::OpenMPInternal::max_hardware_threads(); } namespace Impl { diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index 594f40d5245a48d3665db56d1fd432982cb1c8a8..aa4be87ceb621f13a0e2f9ee646be81580766349 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -27,17 +27,9 @@ static_assert(false, #include <Kokkos_Core_fwd.hpp> -#include <cstddef> -#include <iosfwd> #include <Kokkos_HostSpace.hpp> - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include <Kokkos_HBWSpace.hpp> -#endif - #include <Kokkos_ScratchSpace.hpp> #include <Kokkos_Parallel.hpp> -#include <Kokkos_TaskScheduler.hpp> #include <Kokkos_Layout.hpp> #include <impl/Kokkos_HostSharedPtr.hpp> #include <impl/Kokkos_Profiling_Interface.hpp> @@ -45,6 +37,8 @@ static_assert(false, #include <omp.h> +#include <cstddef> +#include <iosfwd> #include <vector> /*--------------------------------------------------------------------------*/ @@ -53,11 +47,6 @@ namespace Kokkos { namespace Impl { class OpenMPInternal; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -// FIXME_OPENMP we can remove this after we remove partition_master -inline thread_local OpenMPInternal* t_openmp_instance = nullptr; -#endif } // namespace Impl /// \class OpenMP @@ -67,12 +56,7 @@ class OpenMP { //! 
Tag this class as a kokkos execution space using execution_space = OpenMP; - using memory_space = -#ifdef KOKKOS_ENABLE_HBWSPACE - Experimental::HBWSpace; -#else - HostSpace; -#endif + using memory_space = HostSpace; //! This execution space preferred device_type using device_type = Kokkos::Device<execution_space, memory_space>; @@ -82,13 +66,23 @@ class OpenMP { OpenMP(); - OpenMP(int pool_size); + explicit OpenMP(int pool_size); + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template <typename T = void> + KOKKOS_DEPRECATED_WITH_COMMENT( + "OpenMP execution space should be constructed explicitly.") + OpenMP(int pool_size) + : OpenMP(pool_size) {} +#endif /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief is the instance running a parallel algorithm - static bool in_parallel(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED static bool in_parallel(OpenMP const& = OpenMP()) noexcept; +#endif /// \brief Wait until all dispatched functors complete on the given instance /// @@ -98,22 +92,15 @@ class OpenMP { void fence(std::string const& name = "Kokkos::OpenMP::fence: Unnamed Instance Fence") const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Does the given instance return immediately after launching /// a parallel algorithm /// /// This always returns false on OpenMP - inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - /// \brief Partition the default instance and call 'f' on each new 'master' - /// thread - /// - /// Func is a functor with the following signiture - /// void( int partition_id, int num_partitions ) - template <typename F> - KOKKOS_DEPRECATED static void partition_master( - F const& f, int requested_num_partitions = 0, - int requested_partition_size = 0); + KOKKOS_DEPRECATED inline static bool is_asynchronous( + OpenMP const& = OpenMP()) 
noexcept { + return false; + } #endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -166,30 +153,11 @@ class OpenMP { }; inline int OpenMP::impl_thread_pool_rank() noexcept { - // FIXME_OPENMP Can we remove this when removing partition_master? It's only - // used in one partition_master test -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - KOKKOS_IF_ON_HOST( - (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();)) -#else KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) -#endif KOKKOS_IF_ON_DEVICE((return -1;)) } -inline void OpenMP::impl_static_fence(std::string const& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - []() {}); -} - -inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { - return false; -} - inline int OpenMP::impl_thread_pool_size(int depth) const { return depth < 2 ? impl_thread_pool_size() : 1; } @@ -234,7 +202,9 @@ struct MemorySpaceAccess<Kokkos::OpenMP::memory_space, #include <OpenMP/Kokkos_OpenMP_Instance.hpp> #include <OpenMP/Kokkos_OpenMP_Team.hpp> +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include <OpenMP/Kokkos_OpenMP_Task.hpp> +#endif #include <KokkosExp_MDRangePolicy.hpp> /*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 44f0fbc180a7466d8ac89e20becad6080b53c8ba..473a322eecf2037651c7e711d47b351d06aa7b66 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -31,77 +31,20 @@ #include <sstream> #include <thread> +namespace { +int g_openmp_hardware_max_threads = 1; +} + namespace Kokkos { namespace Impl { -void OpenMPInternal::acquire_lock() { - while (1 == desul::atomic_compare_exchange(&m_pool_mutex, 0, 1, - desul::MemoryOrderAcquire(), - 
desul::MemoryScopeDevice())) { - // do nothing - } -} +std::vector<OpenMPInternal *> OpenMPInternal::all_instances; +std::mutex OpenMPInternal::all_instances_mutex; -void OpenMPInternal::release_lock() { - desul::atomic_store(&m_pool_mutex, 0, desul::MemoryOrderRelease(), - desul::MemoryScopeDevice()); +int OpenMPInternal::max_hardware_threads() noexcept { + return g_openmp_hardware_max_threads; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void OpenMPInternal::validate_partition_impl(const int nthreads, - int &num_partitions, - int &partition_size) { - if (nthreads == 1) { - num_partitions = 1; - partition_size = 1; - } else if (num_partitions < 1 && partition_size < 1) { - int idle = nthreads; - for (int np = 2; np <= nthreads; ++np) { - for (int ps = 1; ps <= nthreads / np; ++ps) { - if (nthreads - np * ps < idle) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } else if (num_partitions < 1 && partition_size > 0) { - if (partition_size <= nthreads) { - num_partitions = nthreads / partition_size; - } else { - num_partitions = 1; - partition_size = nthreads; - } - } else if (num_partitions > 0 && partition_size < 1) { - if (num_partitions <= nthreads) { - partition_size = nthreads / num_partitions; - } else { - num_partitions = nthreads; - partition_size = 1; - } - } else if (num_partitions * partition_size > nthreads) { - int idle = nthreads; - const int NP = num_partitions; - const int PS = partition_size; - for (int np = NP; np > 0; --np) { - for (int ps = PS; ps > 0; --ps) { - if ((np * ps <= nthreads) && (nthreads - np * ps < idle)) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } -} -#endif - void OpenMPInternal::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * @@ -178,17 +121,11 @@ void OpenMPInternal::resize_thread_data(size_t pool_reduce_bytes, if (nullptr != m_pool[rank]) { 
m_pool[rank]->disband_pool(); - space.deallocate(m_pool[rank], old_alloc_bytes); + // impl_deallocate to not fence here + space.impl_deallocate("[unlabeled]", m_pool[rank], old_alloc_bytes); } - void *ptr = nullptr; - try { - ptr = space.allocate(alloc_bytes); - } catch ( - Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - // For now, just rethrow the error message the existing way - Kokkos::Impl::throw_runtime_exception(failure.get_error_message()); - } + void *ptr = space.allocate("Kokkos::OpenMP::scratch_mem", alloc_bytes); m_pool[rank] = new (ptr) HostThreadTeamData(); @@ -259,9 +196,9 @@ void OpenMPInternal::initialize(int thread_count) { // Before any other call to OMP query the maximum number of threads // and save the value for re-initialization unit testing. - Impl::g_openmp_hardware_max_threads = get_current_max_threads(); + g_openmp_hardware_max_threads = get_current_max_threads(); - int process_num_threads = Impl::g_openmp_hardware_max_threads; + int process_num_threads = g_openmp_hardware_max_threads; if (Kokkos::hwloc::available()) { process_num_threads = Kokkos::hwloc::get_available_numa_count() * @@ -274,11 +211,11 @@ void OpenMPInternal::initialize(int thread_count) { // process_num_threads if thread_count > 0, set // g_openmp_hardware_max_threads to thread_count if (thread_count < 0) { - thread_count = Impl::g_openmp_hardware_max_threads; + thread_count = g_openmp_hardware_max_threads; } else if (thread_count == 0) { - if (Impl::g_openmp_hardware_max_threads != process_num_threads) { - Impl::g_openmp_hardware_max_threads = process_num_threads; - omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + if (g_openmp_hardware_max_threads != process_num_threads) { + g_openmp_hardware_max_threads = process_num_threads; + omp_set_num_threads(g_openmp_hardware_max_threads); } } else { if (Kokkos::show_warnings() && thread_count > process_num_threads) { @@ -289,16 +226,16 @@ void OpenMPInternal::initialize(int thread_count) { << ", 
requested thread : " << std::setw(3) << thread_count << std::endl; } - Impl::g_openmp_hardware_max_threads = thread_count; - omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + g_openmp_hardware_max_threads = thread_count; + omp_set_num_threads(g_openmp_hardware_max_threads); } // setup thread local -#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads) +#pragma omp parallel num_threads(g_openmp_hardware_max_threads) { Impl::SharedAllocationRecord<void, void>::tracking_enable(); } auto &instance = OpenMPInternal::singleton(); - instance.m_pool_size = Impl::g_openmp_hardware_max_threads; + instance.m_pool_size = g_openmp_hardware_max_threads; // New, unified host thread team data: { @@ -343,10 +280,9 @@ void OpenMPInternal::finalize() { if (this == &singleton()) { auto const &instance = singleton(); // Silence Cuda Warning - const int nthreads = - instance.m_pool_size <= Impl::g_openmp_hardware_max_threads - ? Impl::g_openmp_hardware_max_threads - : instance.m_pool_size; + const int nthreads = instance.m_pool_size <= g_openmp_hardware_max_threads + ? 
g_openmp_hardware_max_threads + : instance.m_pool_size; (void)nthreads; #pragma omp parallel num_threads(nthreads) @@ -355,12 +291,22 @@ void OpenMPInternal::finalize() { // allow main thread to track Impl::SharedAllocationRecord<void, void>::tracking_enable(); - Impl::g_openmp_hardware_max_threads = 1; + g_openmp_hardware_max_threads = 1; } m_initialized = false; - Kokkos::Profiling::finalize(); + // guard erasing from all_instances + { + std::scoped_lock lock(all_instances_mutex); + + auto it = std::find(all_instances.begin(), all_instances.end(), this); + if (it == all_instances.end()) + Kokkos::abort( + "Execution space instance to be removed couldn't be found!"); + *it = all_instances.back(); + all_instances.pop_back(); + } } void OpenMPInternal::print_configuration(std::ostream &s) const { @@ -368,7 +314,7 @@ void OpenMPInternal::print_configuration(std::ostream &s) const { if (m_initialized) { const int numa_count = 1; - const int core_per_numa = Impl::g_openmp_hardware_max_threads; + const int core_per_numa = g_openmp_hardware_max_threads; const int thread_per_core = 1; s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 03f5fff395a850651cd1ee894b94bc4bbed0da8e..2aed723b18ff67fbd38530fa33daf5dee7f33492 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -41,24 +41,12 @@ #include <vector> /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { - -inline bool execute_in_serial(OpenMP const& space = OpenMP()) { - return (OpenMP::in_parallel(space) && - !(omp_get_nested() && (omp_get_level() == 1))); -} - -} // namespace Impl -} // namespace Kokkos namespace Kokkos { namespace Impl { class OpenMPInternal; -inline int g_openmp_hardware_max_threads = 1; - struct 
OpenMPTraits { static constexpr int MAX_THREAD_COUNT = 512; }; @@ -66,7 +54,13 @@ struct OpenMPTraits { class OpenMPInternal { private: OpenMPInternal(int arg_pool_size) - : m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() {} + : m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() { + // guard pushing to all_instances + { + std::scoped_lock lock(all_instances_mutex); + all_instances.push_back(this); + } + } ~OpenMPInternal() { clear_thread_data(); } @@ -76,7 +70,6 @@ class OpenMPInternal { int m_pool_size; int m_level; - int m_pool_mutex = 0; HostThreadTeamData* m_pool[OpenMPTraits::MAX_THREAD_COUNT]; @@ -91,18 +84,9 @@ class OpenMPInternal { void clear_thread_data(); - int thread_pool_size() const { return m_pool_size; } + static int max_hardware_threads() noexcept; - // Acquire lock used to protect access to m_pool - void acquire_lock(); - - // Release lock used to protect access to m_pool - void release_lock(); - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - static void validate_partition_impl(const int nthreads, int& num_partitions, - int& partition_size); -#endif + int thread_pool_size() const { return m_pool_size; } void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, size_t team_shared_bytes, size_t thread_local_bytes); @@ -115,39 +99,34 @@ class OpenMPInternal { return m_pool[i]; } + int get_level() const { return m_level; } + bool is_initialized() const { return m_initialized; } bool verify_is_initialized(const char* const label) const; void print_configuration(std::ostream& s) const; -}; - -} // namespace Impl - -namespace Experimental { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template <> -class MasterLock<OpenMP> { - public: - void lock() { omp_set_lock(&m_lock); } - void unlock() { omp_unset_lock(&m_lock); } - bool try_lock() { return static_cast<bool>(omp_test_lock(&m_lock)); } - KOKKOS_DEPRECATED MasterLock() { omp_init_lock(&m_lock); } - ~MasterLock() { omp_destroy_lock(&m_lock); } + std::mutex 
m_instance_mutex; - MasterLock(MasterLock const&) = delete; - MasterLock(MasterLock&&) = delete; - MasterLock& operator=(MasterLock const&) = delete; - MasterLock& operator=(MasterLock&&) = delete; - - private: - omp_lock_t m_lock; + static std::vector<OpenMPInternal*> all_instances; + static std::mutex all_instances_mutex; }; + +inline bool execute_in_serial(OpenMP const& space = OpenMP()) { +// The default value returned by `omp_get_max_active_levels` with gcc version +// lower than 11.1.0 is 2147483647 instead of 1. +#if (!defined(KOKKOS_COMPILER_GNU) || KOKKOS_COMPILER_GNU >= 1110) && \ + _OPENMP >= 201511 + bool is_nested = omp_get_max_active_levels() > 1; +#else + bool is_nested = static_cast<bool>(omp_get_nested()); #endif + return (space.impl_internal_space_instance()->get_level() < omp_get_level() && + !(is_nested && (omp_get_level() == 1))); +} -} // namespace Experimental +} // namespace Impl namespace Experimental { namespace Impl { @@ -182,7 +161,7 @@ inline std::vector<OpenMP> create_OpenMP_instances( "Kokkos::abort: Partition not enough resources left to create the last " "instance."); } - instances[weights.size() - 1] = resources_left; + instances[weights.size() - 1] = OpenMP(resources_left); return instances; } @@ -202,50 +181,6 @@ std::vector<OpenMP> partition_space(OpenMP const& main_instance, return Impl::create_OpenMP_instances(main_instance, weights); } } // namespace Experimental - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template <typename F> -KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, - int partition_size) { -#if _OPENMP >= 201511 - if (omp_get_max_active_levels() > 1) { -#else - if (omp_get_nested()) { -#endif - using Exec = Impl::OpenMPInternal; - - Exec* prev_instance = &Impl::OpenMPInternal::singleton(); - - Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, - partition_size); - - OpenMP::memory_space space; - -#pragma omp parallel num_threads(num_partitions) - { - Exec 
thread_local_instance(partition_size); - Impl::t_openmp_instance = &thread_local_instance; - - size_t pool_reduce_bytes = 32 * partition_size; - size_t team_reduce_bytes = 32 * partition_size; - size_t team_shared_bytes = 1024 * partition_size; - size_t thread_local_bytes = 1024; - - thread_local_instance.resize_thread_data( - pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, - thread_local_bytes); - - omp_set_num_threads(partition_size); - f(omp_get_thread_num(), omp_get_num_threads()); - Impl::t_openmp_instance = nullptr; - } - } else { - // nested openmp not enabled - f(0, 1); - } -} -#endif - } // namespace Kokkos #endif diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp index 96dc664eb79a58fc0c9363eca928e2d7455333dd..79d7d295c0e618b01484df29260862e3791d6c0a 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp @@ -108,6 +108,8 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::OpenMP> { public: inline void execute() const { + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(m_instance->m_instance_mutex); if (execute_in_serial(m_policy.space())) { exec_range(m_functor, m_policy.begin(), m_policy.end()); return; @@ -147,15 +149,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::OpenMP> { inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -210,6 +204,9 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, public: inline 
void execute() const { + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(m_instance->m_instance_mutex); + #ifndef KOKKOS_COMPILER_INTEL if (execute_in_serial(m_iter.m_rp.space())) { exec_range(0, m_iter.m_rp.m_num_tiles); @@ -251,16 +248,9 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy) : m_instance(nullptr), m_iter(arg_policy, arg_functor) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } + template <typename Policy, typename Functor> static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -348,7 +338,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, const size_t team_shared_size = m_shmem_size; const size_t thread_local_size = 0; // Never shrinks - m_instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -358,8 +349,6 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, m_functor, *(m_instance->get_thread_data()), 0, m_policy.league_size(), m_policy.league_size()); - m_instance->release_lock(); - return; } @@ -398,8 +387,6 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, data.disband_team(); } - - m_instance->release_lock(); } inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) @@ -409,15 +396,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize<FunctorType>::value( arg_functor, arg_policy.team_size())) { 
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp index 52cdef18e65965da7d1dcdbb263e98df6b4a60f0..d22e1e7eda0b588c81b9eb4ea154998ed9bd6087 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp @@ -83,7 +83,8 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, const size_t pool_reduce_bytes = reducer.value_size(); - m_instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes , @@ -106,6 +107,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, update); reducer.final(ptr); + return; } const int pool_size = m_instance->thread_pool_size(); @@ -157,8 +159,6 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, m_result_ptr[j] = ptr[j]; } } - - m_instance->release_lock(); } //---------------------------------------- @@ -170,15 +170,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), m_result_ptr(arg_view.data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space, 
Kokkos::HostSpace>::accessible, @@ -226,7 +218,8 @@ class ParallelReduce<CombinedFunctorReducerType, const ReducerType& reducer = m_iter.m_func.get_reducer(); const size_t pool_reduce_bytes = reducer.value_size(); - m_instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes , @@ -249,8 +242,6 @@ class ParallelReduce<CombinedFunctorReducerType, reducer.final(ptr); - m_instance->release_lock(); - return; } #endif @@ -307,8 +298,6 @@ class ParallelReduce<CombinedFunctorReducerType, m_result_ptr[j] = ptr[j]; } } - - m_instance->release_lock(); } //---------------------------------------- @@ -319,15 +308,7 @@ class ParallelReduce<CombinedFunctorReducerType, : m_instance(nullptr), m_iter(arg_policy, arg_functor_reducer), m_result_ptr(arg_view.data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space, Kokkos::HostSpace>::accessible, @@ -431,7 +412,8 @@ class ParallelReduce<CombinedFunctorReducerType, const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); const size_t thread_local_size = 0; // Never shrinks - m_instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -449,8 +431,6 @@ class ParallelReduce<CombinedFunctorReducerType, reducer.final(ptr); - m_instance->release_lock(); - return; } @@ -526,8 +506,6 @@ class ParallelReduce<CombinedFunctorReducerType, m_result_ptr[j] = ptr[j]; } } - - 
m_instance->release_lock(); } //---------------------------------------- @@ -543,15 +521,7 @@ class ParallelReduce<CombinedFunctorReducerType, arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize<FunctorType>::value( arg_functor_reducer.get_functor(), arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space, diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp index 02707e7fbee1f2dd4b63397135152d7515fa7c36..b9ce25d3ee56adfa04c28eafe18f5d4901883020 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Scan.hpp @@ -70,6 +70,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, const int value_count = Analysis::value_count(m_functor); const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor); + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(m_instance->m_instance_mutex); + m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes , 0 // team_shared_bytes @@ -140,15 +143,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -201,7 +196,8 @@ class 
ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, const int value_count = Analysis::value_count(m_functor); const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor); - m_instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(m_instance->m_instance_mutex); m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes , @@ -221,8 +217,6 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, *m_result_ptr = update; - m_instance->release_lock(); - return; } @@ -274,8 +268,6 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, *m_result_ptr = update_base; } } - - m_instance->release_lock(); } //---------------------------------------- @@ -292,15 +284,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space, Kokkos::HostSpace>::accessible, "Kokkos::OpenMP parallel_scan result must be host-accessible!"); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } //---------------------------------------- diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp index 3e67d8d62527d151b65dd097b6bf0aaee68f8011..54c1574d71d3264d75f8af40cbbad83f5e9f48d6 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp @@ -52,18 +52,7 @@ HostThreadTeamDataSingleton::HostThreadTeamDataSingleton() num_pool_reduce_bytes, num_team_reduce_bytes, num_team_shared_bytes, num_thread_local_bytes); - void* ptr = nullptr; - try { - ptr = space.allocate(alloc_bytes); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& 
f) { - // For now, just rethrow the error message with a note - // Note that this could, in turn, trigger an out of memory exception, - // but it's pretty unlikely, so we won't worry about it for now. - // TODO reasonable error message when `std::string` causes OOM error - Kokkos::Impl::throw_runtime_exception( - std::string("Failure to allocate scratch memory: ") + - f.get_error_message()); - } + void* ptr = space.allocate("Kokkos::Impl::HostThreadTeamData", alloc_bytes); HostThreadTeamData::scratch_assign( ptr, alloc_bytes, num_pool_reduce_bytes, num_team_reduce_bytes, diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 01b66948654c8d7e8c3f3a450a7f738f6f01bb9c..6edcbff0c26b593c27b02fb5b7260f809508260a 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -26,12 +26,19 @@ #include <impl/Kokkos_HostThreadTeam.hpp> #include <OpenMP/Kokkos_OpenMP.hpp> +#include <impl/Kokkos_TaskTeamMember.hpp> + #include <type_traits> #include <cassert> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -73,7 +80,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::OpenMP, QueueType>> { execution_space().impl_internal_space_instance(); const int pool_size = get_max_team_count(scheduler.get_execution_space()); - instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(instance->m_instance_mutex); // TODO @tasking @new_feature DSH allow team sizes other than 1 const int team_size = 1; // Threads per core @@ -152,8 +160,6 @@ class 
TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::OpenMP, QueueType>> { } self.disband_team(); } // end pragma omp parallel - - instance->release_lock(); } static uint32_t get_max_team_count(execution_space const& espace) { @@ -238,7 +244,8 @@ class TaskQueueSpecializationConstrained< execution_space().impl_internal_space_instance(); const int pool_size = instance->thread_pool_size(); - instance->acquire_lock(); + // Serialize kernels on the same execution space instance + std::lock_guard<std::mutex> lock(instance->m_instance_mutex); const int team_size = 1; // Threads per core instance->resize_thread_data(0 /* global reduce buffer */ @@ -250,6 +257,7 @@ class TaskQueueSpecializationConstrained< 0 /* thread local buffer */ ); assert(pool_size % team_size == 0); + auto& queue = scheduler.queue(); queue.initialize_team_queues(pool_size / team_size); @@ -343,8 +351,6 @@ class TaskQueueSpecializationConstrained< } self.disband_team(); } // end pragma omp parallel - - instance->release_lock(); } template <typename TaskType> @@ -361,6 +367,10 @@ extern template class TaskQueue<Kokkos::OpenMP, } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp index a37e1758a26104784a816a311bbae7c7475f6040..5937c093ba17c1729d55b30dfeeeaadb1a00ead5 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp @@ -105,7 +105,8 @@ class UniqueToken<OpenMP, UniqueTokenScope::Global> { /// \brief upper bound for acquired values, i.e. 
0 <= value < size() KOKKOS_INLINE_FUNCTION int size() const noexcept { - KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;)) + KOKKOS_IF_ON_HOST( + (return Kokkos::Impl::OpenMPInternal::max_hardware_threads();)) KOKKOS_IF_ON_DEVICE((return 0;)) } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index adf972dd081f6a6c055d39965b1d06c4a6daf03e..b20bb7a345aa7b3b2b4cc94f2bbe95ac940f163d 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -33,7 +33,6 @@ static_assert(false, #include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp> #include <Kokkos_ScratchSpace.hpp> #include <Kokkos_Parallel.hpp> -#include <Kokkos_TaskScheduler.hpp> #include <Kokkos_Layout.hpp> #include <impl/Kokkos_Profiling_Interface.hpp> #include <impl/Kokkos_InitializationSettings.hpp> @@ -65,7 +64,11 @@ class OpenMPTarget { using scratch_memory_space = ScratchMemorySpace<OpenMPTarget>; - inline static bool in_parallel() { return omp_in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static bool in_parallel() { + return omp_in_parallel(); + } +#endif static void fence(const std::string& name = "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence"); @@ -142,8 +145,8 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { /*--------------------------------------------------------------------------*/ #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp> -#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp> -#include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp> /*--------------------------------------------------------------------------*/ diff --git 
a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index 81fbc56de00535c03975f361447cdd14fd78f0f4..635b0e0504fc7fd10704d99b01c62545712f0329 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -37,7 +37,6 @@ #include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp> #include <impl/Kokkos_Error.hpp> #include <Kokkos_Atomic.hpp> -#include <impl/Kokkos_MemorySpace.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -55,9 +54,11 @@ void* OpenMPTargetSpace::impl_allocate( static_assert(sizeof(void*) == sizeof(uintptr_t), "Error sizeof(void*) != sizeof(uintptr_t)"); - void* ptr; + void* ptr = omp_target_alloc(arg_alloc_size, omp_get_default_device()); - ptr = omp_target_alloc(arg_alloc_size, omp_get_default_device()); + if (!ptr) { + Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); + } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = @@ -110,79 +111,13 @@ void OpenMPTargetSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord<void, void> SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>::s_root_record; -#endif - -SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, - void>::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - alloc_size, (alloc_size - 
sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, - void>::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // TODO DeepCopy - // DeepCopy - Kokkos::Impl::DeepCopy<Experimental::OpenMPTargetSpace, HostSpace>( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, " - "void>::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1 #include <impl/Kokkos_SharedAlloc_timpl.hpp> -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenMPTargetSpace); // </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index e5b33d0982f83e9bb5ae720734d3f0081fbec4ba..ec33d25b9695338a064b6d7cd6cd5e8b540870fb 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -28,6 +28,7 @@ static_assert(false, #include <typeinfo> #include <Kokkos_Core_fwd.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp> #ifdef KOKKOS_ENABLE_OPENMPTARGET @@ -91,13 +92,23 @@ class OpenMPTargetSpace { /**\brief Default memory space instance */ OpenMPTargetSpace(); - OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; - OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; - OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; + OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; + OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; + OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; OpenMPTargetSpace& operator=(const OpenMPTargetSpace&) = default; ~OpenMPTargetSpace() = default; /**\brief Allocate untracked memory in the space */ + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const char* 
arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -121,9 +132,6 @@ class OpenMPTargetSpace { const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = Kokkos::Tools::make_space_handle(name())) const; - - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>; }; } // namespace Experimental } // namespace Kokkos @@ -131,138 +139,8 @@ class OpenMPTargetSpace { //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void> - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend Kokkos::Experimental::OpenMPTargetSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenMPTargetSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenMPTargetSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template <typename ExecutionSpace> - SharedAllocationRecord( - const 
ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc) { - KOKKOS_IF_ON_HOST( - (return new SharedAllocationRecord(arg_space, arg_label, arg_alloc);)) - KOKKOS_IF_ON_DEVICE( - ((void)arg_space; (void)arg_label; (void)arg_alloc; return nullptr;)) - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// TODO: implement all possible deep_copies -template <class ExecutionSpace> -struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, - Kokkos::Experimental::OpenMPTargetSpace, ExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - // In the Release and RelWithDebInfo builds, the size of the memcpy should - // be greater than zero to avoid error. omp_target_memcpy returns zero on - // success. 
- if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast<void*>(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, OpenMPTargetSpace>: fence " - "before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast<void*>(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } -}; - -template <class ExecutionSpace> -struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, HostSpace, - ExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast<void*>(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, HostSpace>: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast<void*>(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } -}; - -template <class ExecutionSpace> -struct DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace, - ExecutionSpace> { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast<void*>(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy<HostSpace, OpenMPTargetSpace>: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast<void*>(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + 
Kokkos::Experimental::OpenMPTargetSpace); #endif #endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aace09e266b0395b76be37c7beedf2db4a09f152 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp @@ -0,0 +1,101 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include <Kokkos_Macros.hpp> +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_OPENMPTARGET_DEEP_COPY_HPP +#define KOKKOS_OPENMPTARGET_DEEP_COPY_HPP + +#include <OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// TODO: implement all possible deep_copies +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, + Kokkos::Experimental::OpenMPTargetSpace, ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + // In the Release and RelWithDebInfo builds, the size of the memcpy should + // be greater than zero to avoid error. omp_target_memcpy returns zero on + // success. 
+ if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast<void*>(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, OpenMPTargetSpace>: fence " + "before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast<void*>(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, HostSpace, + ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast<void*>(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, HostSpace>: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast<void*>(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace, + ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast<void*>(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy<HostSpace, OpenMPTargetSpace>: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast<void*>(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_OPENMPTARGET_DEEP_COPY_HPP diff --git 
a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp deleted file mode 100644 index 1902c38409a98d80689cbb13a9879d7a7db23ec2..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ /dev/null @@ -1,164 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include <stdio.h> -#include <limits> -#include <iostream> -#include <vector> -#include <Kokkos_Core.hpp> -#include <impl/Kokkos_Error.hpp> -#include <iostream> -#include <impl/Kokkos_CPUDiscovery.hpp> -#include <impl/Kokkos_Tools.hpp> - -#ifdef KOKKOS_ENABLE_OPENMPTARGET - -// FIXME_OPENMPTARGET currently unused -/* -namespace Kokkos { -namespace Impl { -namespace { - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() { return omp_in_parallel(); } - -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos -*/ - -namespace Kokkos { -namespace Impl { - -void OpenMPTargetExec::verify_is_process(const char* const label) { - // Fails if the current task is in a parallel region or is not on the host. 
- if (omp_in_parallel() && (!omp_is_initial_device())) { - std::string msg(label); - msg.append(" ERROR: in parallel or on device"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void OpenMPTargetExec::verify_initialized(const char* const label) { - if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { - std::string msg(label); - msg.append(" ERROR: not initialized"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void* OpenMPTargetExec::m_scratch_ptr = nullptr; -int64_t OpenMPTargetExec::m_scratch_size = 0; -int* OpenMPTargetExec::m_lock_array = nullptr; -uint64_t OpenMPTargetExec::m_lock_size = 0; -uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; -int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; - -void OpenMPTargetExec::clear_scratch() { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_ptr = nullptr; - m_scratch_size = 0; -} - -void OpenMPTargetExec::clear_lock_array() { - if (m_lock_array != nullptr) { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_lock_array, m_lock_size); - m_lock_array = nullptr; - m_lock_size = 0; - } -} - -void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; } - -void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, - int64_t shmem_size_L1, - int64_t league_size) { - Kokkos::Experimental::OpenMPTargetSpace space; - const int64_t shmem_size = - shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. - const int64_t padding = shmem_size * 10 / 100; // Padding per team. - - // Maximum active teams possible. - // The number should not exceed the maximum in-flight teams possible or the - // league_size. - int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); - - // max_active_teams is the number of active teams on the given hardware. 
- // We set the number of teams to be twice the number of max_active_teams for - // the compiler to pick the right number in its case. - // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. -#if !defined(KOKKOS_COMPILER_CRAY_LLVM) - omp_set_num_teams(max_active_teams * 2); -#endif - - // Total amount of scratch memory allocated is depenedent - // on the maximum number of in-flight teams possible. - int64_t total_size = - (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - max_active_teams * 2; - - if (total_size > m_scratch_size) { - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_size = total_size; - m_scratch_ptr = space.allocate(total_size); - } -} - -int* OpenMPTargetExec::get_lock_array(int num_teams) { - Kokkos::Experimental::OpenMPTargetSpace space; - int max_active_league_size = MAX_ACTIVE_THREADS / 32; - int lock_array_elem = - (num_teams > max_active_league_size) ? num_teams : max_active_league_size; - if (m_lock_size < (lock_array_elem * sizeof(int))) { - space.deallocate(m_lock_array, m_lock_size); - m_lock_size = lock_array_elem * sizeof(int); - m_lock_array = static_cast<int*>(space.allocate(m_lock_size)); - - // FIXME_OPENMPTARGET - Creating a target region here to initialize the - // lock_array with 0's fails. Hence creating an equivalent host array to - // achieve the same. Value of host array are then copied to the lock_array. 
- int* h_lock_array = static_cast<int*>( - omp_target_alloc(m_lock_size, omp_get_initial_device())); - - for (int i = 0; i < lock_array_elem; ++i) h_lock_array[i] = 0; - - if (0 < m_lock_size) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - m_lock_array, h_lock_array, m_lock_size, 0, 0, - omp_get_default_device(), omp_get_initial_device())); - - omp_target_free(h_lock_array, omp_get_initial_device()); - } - - return m_lock_array; -} - -} // namespace Impl -} // namespace Kokkos - -#endif // KOKKOS_ENABLE_OPENMPTARGET diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp new file mode 100644 index 0000000000000000000000000000000000000000..13b509c0ada0dbfc289f047257329fced06315dd --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp @@ -0,0 +1,48 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP +#define KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP + +#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp> +#include <type_traits> + +namespace Kokkos::Experimental::Impl { + +template <class Functor, class Policy> +class FunctorAdapter { + Functor m_functor; + using WorkTag = typename Policy::work_tag; + + public: + FunctorAdapter() = default; + FunctorAdapter(Functor const &functor) : m_functor(functor) {} + + Functor get_functor() const { return m_functor; } + + template <class... 
Args> + KOKKOS_FUNCTION void operator()(Args &&...args) const { + if constexpr (std::is_void_v<WorkTag>) { + m_functor(static_cast<Args &&>(args)...); + } else { + m_functor(WorkTag(), static_cast<Args &&>(args)...); + } + } +}; + +} // namespace Kokkos::Experimental::Impl + +#endif // KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 9e8844a6f20846ee675d24c66c987fd3706288c1..53e723882f55c2672c08e85680042c6651fbac52 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -27,11 +27,11 @@ // constructor. undef'ed at the end #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND +#include <Kokkos_Core.hpp> #include <OpenMPTarget/Kokkos_OpenMPTarget.hpp> #include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp> #include <OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp> #include <impl/Kokkos_ExecSpaceManager.hpp> -#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp> #include <sstream> @@ -105,19 +105,15 @@ void OpenMPTargetInternal::print_configuration(std::ostream& os, void OpenMPTargetInternal::impl_finalize() { m_is_initialized = false; - Kokkos::Impl::OpenMPTargetExec space; - if (space.m_lock_array != nullptr) space.clear_lock_array(); - if (space.m_uniquetoken_ptr != nullptr) + if (m_uniquetoken_ptr != nullptr) Kokkos::kokkos_free<Kokkos::Experimental::OpenMPTargetSpace>( - space.m_uniquetoken_ptr); + m_uniquetoken_ptr); } void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; - Kokkos::Impl::OpenMPTargetExec::MAX_ACTIVE_THREADS = concurrency(); - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. // FIXME_OPENMPTARGTE: Cray compiler did not yet implement omp_set_num_teams. 
@@ -137,7 +133,75 @@ OpenMPTargetInternal* OpenMPTargetInternal::impl_singleton() { return &self; } -} // Namespace Impl +void OpenMPTargetInternal::verify_is_process(const char* const label) { + // Fails if the current task is in a parallel region or is not on the host. + if (omp_in_parallel() && (!omp_is_initial_device())) { + std::string msg(label); + msg.append(" ERROR: in parallel or on device"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::verify_initialized(const char* const label) { + if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { + std::string msg(label); + msg.append(" ERROR: not initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::clear_scratch() { + Kokkos::Experimental::OpenMPTargetSpace space; + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_ptr = nullptr; + m_scratch_size = 0; +} + +void* OpenMPTargetInternal::get_scratch_ptr() { return m_scratch_ptr; } + +void OpenMPTargetInternal::resize_scratch(int64_t team_size, + int64_t shmem_size_L0, + int64_t shmem_size_L1, + int64_t league_size) { + Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif + const int64_t shmem_size = + shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. + const int64_t padding = shmem_size * 10 / 100; // Padding per team. + + // Maximum active teams possible. + // The number should not exceed the maximum in-flight teams possible or the + // league_size. + int max_active_teams = + std::min(OpenMPTargetInternal::concurrency() / team_size, league_size); + + // max_active_teams is the number of active teams on the given hardware. 
+ // We set the number of teams to be twice the number of max_active_teams for + // the compiler to pick the right number in its case. + // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. +#if !defined(KOKKOS_COMPILER_CRAY_LLVM) + omp_set_num_teams(max_active_teams * 2); +#endif + + // Total amount of scratch memory allocated is dependent + // on the maximum number of in-flight teams possible. + int64_t total_size = + (shmem_size + + ::Kokkos::Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * + max_active_teams * 2; + + if (total_size > m_scratch_size) { + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_size = total_size; + m_scratch_ptr = space.allocate(total_size); + } +} + +} // namespace Impl OpenMPTarget::OpenMPTarget() : m_space_instance(Impl::OpenMPTargetInternal::impl_singleton()) {} @@ -178,8 +242,10 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { } void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { + using Kokkos::Impl::get_visible_devices; + std::vector<int> const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - const int device_num = get_gpu(settings); + const int device_num = get_gpu(settings).value_or(visible_devices[0]); omp_set_default_device(device_num); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); @@ -205,9 +271,9 @@ namespace Experimental { UniqueToken<Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::UniqueTokenScope::Global>:: - UniqueToken(Kokkos::Experimental::OpenMPTarget const&) { + UniqueToken(Kokkos::Experimental::OpenMPTarget const& space) { #ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND - uint32_t* ptr = Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr; + uint32_t* ptr = space.impl_internal_space_instance()->m_uniquetoken_ptr; int count = Kokkos::Experimental::OpenMPTarget().concurrency(); if (ptr == nullptr) { int size = count * sizeof(uint32_t); @@ -220,7 +286,7 @@
UniqueToken<Kokkos::Experimental::OpenMPTarget, 0, omp_get_default_device(), omp_get_initial_device())); - Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr = ptr; + space.impl_internal_space_instance()->m_uniquetoken_ptr = ptr; } #else // FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr` @@ -228,8 +294,7 @@ UniqueToken<Kokkos::Experimental::OpenMPTarget, // Version 1 - Creating a target region and filling the // pointer Error - CUDA error: named symbol not found #pragma omp target teams distribute parallel for is_device_ptr(ptr) \ - map(to \ - : size) + map(to : size) for (int i = 0; i < count; ++i) ptr[i] = 0; // Version 2 : Allocating a view on the device and filling it with a scalar diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp index bea3bb3b12b8f6161159871516eaca663b99d7b5..cffe056f0d0019ff11a6eea13b8282f9eb95c615 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp @@ -17,8 +17,6 @@ #ifndef KOKKOS_OPENMPTARGET_INSTANCE_HPP #define KOKKOS_OPENMPTARGET_INSTANCE_HPP -#include <Kokkos_Core.hpp> - namespace Kokkos { namespace Experimental { namespace Impl { @@ -27,9 +25,9 @@ enum class openmp_fence_is_static { yes, no }; class OpenMPTargetInternal { private: - OpenMPTargetInternal() = default; - OpenMPTargetInternal(const OpenMPTargetInternal&) = default; - OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default; + OpenMPTargetInternal() = default; + OpenMPTargetInternal(const OpenMPTargetInternal&) = delete; + OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = delete; public: void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no); @@ -55,6 +53,19 @@ class OpenMPTargetInternal { static OpenMPTargetInternal* impl_singleton(); + static void verify_is_process(const char* const); + 
static void verify_initialized(const char* const); + + void* get_scratch_ptr(); + void clear_scratch(); + void resize_scratch(int64_t team_reduce_bytes, int64_t team_shared_bytes, + int64_t thread_local_bytes, int64_t league_size); + + void* m_scratch_ptr = nullptr; + std::mutex m_mutex_scratch_ptr; + int64_t m_scratch_size = 0; + uint32_t* m_uniquetoken_ptr = nullptr; + private: bool m_is_initialized = false; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp index d718f56d38b0382dc16d0684a5bf6efa42bd68cb..e353676b61785cae8103812d0f7a868a6b5ee7f9 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp @@ -22,6 +22,10 @@ namespace Kokkos { namespace Impl { +using OpenMPTargetIterateLeft = std::integral_constant<Iterate, Iterate::Left>; +using OpenMPTargetIterateRight = + std::integral_constant<Iterate, Iterate::Right>; + template <typename Rank, ::Kokkos::Impl::TeamMDRangeThreadAndVector ThreadAndVector> struct ThreadAndVectorNestLevel<Rank, Kokkos::Experimental::OpenMPTarget, @@ -30,4 +34,5 @@ struct ThreadAndVectorNestLevel<Rank, Kokkos::Experimental::OpenMPTarget, } // namespace Impl } // namespace Kokkos + #endif diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2bd672f4d06b5bdcdf832413b21a6a4e8f90e080 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP +#define KOKKOS_OPENMPTARGET_MACROS_HPP + +// Intel architectures prefer the classical hierarchical parallelism that relies +// on OpenMP. +#if defined(KOKKOS_ARCH_INTEL_GPU) +#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU +#endif + +// Define a macro for llvm compiler greater than version 17 and on NVIDIA and +// AMD GPUs. This would be useful in cases where non-OpenMP standard llvm +// extensions can be used. +#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \ + (defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)) +#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS +#endif + +#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x) +#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \ + KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x) + +// Use scratch memory extensions to request dynamic shared memory for the +// right compiler/architecture combination. 
+#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS +#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N) +#else +#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) +#endif + +#endif // KOKKOS_OPENMPTARGET_MACROS_HPP diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 9767d8e53eff19f1b888c3e0fbf78e2623cf3416..f71f8887135eb27ed4cca43a0ed631ae62c32500 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -21,16 +21,10 @@ #include <sstream> #include <Kokkos_Parallel.hpp> #include <impl/Kokkos_Traits.hpp> -#include <impl/Kokkos_Spinwait.hpp> #include <Kokkos_Atomic.hpp> #include "Kokkos_OpenMPTarget_Abort.hpp" - -// Intel architectures prefer the classical hierarchical parallelism that relies -// on OpenMP. -#if defined(KOKKOS_ARCH_INTEL_GPU) -#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU -#endif +#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -113,14 +107,20 @@ class OpenMPTargetExecTeamMember { team_broadcast(value, thread_id); } - // FIXME_OPENMPTARGET this function has the wrong interface and currently - // ignores the reducer passed. - template <class ValueType, class JoinOp> - KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value, - const JoinOp&) const { + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> + team_reduce(ReducerType const& reducer) const noexcept { + team_reduce(reducer, reducer.reference()); + } + + // FIXME_OPENMPTARGET this function currently ignores the reducer passed. 
+ template <typename ReducerType> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> + team_reduce(ReducerType const&, + typename ReducerType::value_type& value) const noexcept { #pragma omp barrier - using value_type = ValueType; + using value_type = typename ReducerType::value_type; // const JoinLambdaAdapter<value_type, JoinOp> op(op_in); // Make sure there is enough scratch space: @@ -149,8 +149,9 @@ class OpenMPTargetExecTeamMember { } #pragma omp barrier } - return team_scratch[0]; + value = team_scratch[0]; } + /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. * @@ -249,15 +250,37 @@ class OpenMPTargetExecTeamMember { // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for // hierarchical reduction. There is an additional 10% of the requested // scratch memory allocated per team as padding. Hence the product with 0.1. + // + // Use llvm extensions for dynamic shared memory with compilers/architecture + // combinations where it is supported. + // + // Size allocated in HBM will now change based on whether we use llvm + // extensions. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1; +#else + const int total_shmem = + shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1; +#endif + + // Per team offset for buffer in HBM. 
const int reduce_offset = - m_shmem_block_index * - (shmem_size_L0 + shmem_size_L1 + - ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE); + m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE); + +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE; + char* l0_scratch = + static_cast<char*>(llvm_omp_target_dynamic_shared_alloc()); + m_team_shared = scratch_memory_space( + l0_scratch, shmem_size_L0, static_cast<char*>(glb_scratch) + l1_offset, + shmem_size_L1); +#else const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE; const int l1_offset = l0_offset + shmem_size_L0; m_team_shared = scratch_memory_space( (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0, static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1); +#endif m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset; m_league_rank = league_rank; m_team_rank = omp_tid; @@ -718,46 +741,6 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> { } // namespace Impl -} // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief Data for OpenMPTarget thread execution */ - -class OpenMPTargetExec { - public: - // FIXME_OPENMPTARGET - Currently the maximum number of - // teams possible is calculated based on NVIDIA's Volta GPU. In - // future this value should be based on the chosen architecture for the - // OpenMPTarget backend. 
- static int MAX_ACTIVE_THREADS; - - private: - static void* scratch_ptr; - - public: - static void verify_is_process(const char* const); - static void verify_initialized(const char* const); - - static int* get_lock_array(int num_teams); - static void* get_scratch_ptr(); - static void clear_scratch(); - static void clear_lock_array(); - static void resize_scratch(int64_t team_reduce_bytes, - int64_t team_shared_bytes, - int64_t thread_local_bytes, int64_t league_size); - - static void* m_scratch_ptr; - static int64_t m_scratch_size; - static int* m_lock_array; - static uint64_t m_lock_size; - static uint32_t* m_uniquetoken_ptr; -}; - -} // namespace Impl } // namespace Kokkos #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..38ed7c5681a1ec3bf168e5dab03646a6623f57e3 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp @@ -0,0 +1,340 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP + +#include <omp.h> +#include <Kokkos_Parallel.hpp> +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... Traits> +class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy<Traits...>; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy>; + const FunctorAdapter m_functor; + + const Policy m_policy; + + public: + inline void execute() const { + Experimental::Impl::OpenMPTargetInternal::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + Experimental::Impl::OpenMPTargetInternal::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + + Policy policy = m_policy; + + static_assert(1 < Policy::rank && Policy::rank < 7); + static_assert(Policy::inner_direction == Iterate::Left || + Policy::inner_direction == Iterate::Right); + + execute_tile<Policy::rank>( + m_functor, policy, + std::integral_constant<Iterate, Policy::inner_direction>()); + } + + template <int Rank> + inline std::enable_if_t<Rank == 2> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target 
teams distribute parallel for collapse(2) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + functor(i0, i1); + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 3> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + functor(i0, i1, i2); + } + } + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 4> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + functor(i0, i1, i2, i3); + } + } + } + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 5> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = 
policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + functor(i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 6> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i5 = begin_5; i5 < end_5; ++i5) { + { + functor(i0, i1, i2, i3, i4, i5); + } + } + } + } + } + } + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 2> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const 
Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i1 = begin_1; i1 < end_1; ++i1) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1); + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 3> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2); + } + } + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 4> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, i3); + } + } + } + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 5> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = 
policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, i3, i4); + } + } + } + } + } + } + + template <int Rank> + inline std::enable_if_t<Rank == 6> execute_tile( + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + { + functor(i0, i1, i2, i3, i4, i5); + } + } + } + } + } + } + } + } + + inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + // TODO DZP: based on a conversation with 
Christian, we're using 256 as a + // heuristic here. We need something better once we can query these kinds of + // properties + template <typename Policy, typename Functor> + static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp index a674637a3b1aca146e18e07b6b84912f5280d93f..502461cc5e08a4bdfeaf181f832c3b9a8c9835a1 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp @@ -20,6 +20,8 @@ #include <omp.h> #include <sstream> #include <Kokkos_Parallel.hpp> +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" namespace Kokkos { namespace Impl { @@ -28,36 +30,30 @@ template <class FunctorType, class... 
Traits> class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::RangePolicy<Traits...>; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Policy = Kokkos::RangePolicy<Traits...>; + using Member = typename Policy::member_type; - const FunctorType m_functor; + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy> m_functor; const Policy m_policy; public: - void execute() const { execute_impl<WorkTag>(); } + void execute() const { execute_impl(); } - template <class TagType> void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto begin = m_policy.begin(); const auto end = m_policy.end(); if (end <= begin) return; - FunctorType a_functor(m_functor); + auto const a_functor(m_functor); #pragma omp target teams distribute parallel for map(to : a_functor) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void<TagType>::value) { - a_functor(i); - } else { - a_functor(TagType(), i); - } + a_functor(i); } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 1abc925caed58c9e91ef054b1abfb5111876a9d7..77dc71a87b7836f3aebbd8a282015d62b07b33f8 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -19,8 +19,10 @@ #include <omp.h> #include <sstream> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp> #include <Kokkos_Parallel.hpp> #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp> 
+#include <OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp> namespace Kokkos { @@ -75,28 +77,27 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, using Policy = Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Member = typename Policy::member_type; + + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy> m_functor; - const FunctorType m_functor; const Policy m_policy; const size_t m_shmem_size; public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl<WorkTag>(); + execute_impl(); } private: - template <class TagType> void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto league_size = m_policy.league_size(); const auto team_size = m_policy.team_size(); @@ -104,11 +105,12 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, - league_size); + m_policy.space().impl_internal_space_instance()->resize_scratch( + team_size, shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - FunctorType a_functor(m_functor); + void* scratch_ptr = + 
m_policy.space().impl_internal_space_instance()->get_scratch_ptr(); + auto const a_functor(m_functor); // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the // scratch implementation does not work in the Release or RelWithDebugInfo @@ -121,7 +123,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(m_policy.space().concurrency() / team_size, league_size); #endif // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the @@ -140,8 +142,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, // guarantees that the number of teams specified in the `num_teams` clause is // always less than or equal to the maximum concurrently running teams. #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \ - num_teams(max_active_teams) is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams thread_limit(team_size) firstprivate(a_functor) + num_teams(max_active_teams) is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel { if (omp_get_num_teams() > max_active_teams) @@ -158,16 +162,13 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, typename Policy::member_type team(league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v<TagType>) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #else #pragma omp target teams distribute firstprivate(a_functor) \ is_device_ptr(scratch_ptr) num_teams(max_active_teams) \ - thread_limit(team_size) + thread_limit(team_size) for (int i = 0; i < league_size; i++) { #pragma omp parallel { @@ -177,10 +178,7 @@ class ParallelFor<FunctorType, 
Kokkos::TeamPolicy<Properties...>, typename Policy::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v<TagType>) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #endif diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bee604834c7863f23487df5ae4de542b0f5f1338 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -0,0 +1,619 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP + +#include <omp.h> +#include <Kokkos_Parallel.hpp> +#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" +#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class CombinedFunctorReducerType, class... 
Traits> +class ParallelReduce<CombinedFunctorReducerType, + Kokkos::MDRangePolicy<Traits...>, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy<Traits...>; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + + static constexpr bool UseReducer = + !std::is_same_v<FunctorType, typename ReducerType::functor_type>; + + const pointer_type m_result_ptr; + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + + using ParReduceCopy = ParallelReduceCopy<pointer_type>; + + bool m_result_ptr_on_device; + + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy>; + + public: + inline void execute() const { + // Only let one ParallelReduce instance at a time use the scratch memory. 
+ std::scoped_lock<std::mutex> scratch_memory_lock( + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = FunctorAdapter(m_functor_reducer.get_functor()); + execute_tile<Policy::rank, typename ReducerType::value_type>( + functor, m_policy, m_result_ptr, + std::integral_constant<Iterate, Policy::inner_direction>()); + } + + template <class ViewType> + inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + Policy arg_policy, const ViewType& arg_result_view) + : m_result_ptr(arg_result_view.data()), + m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr_on_device( + MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace, + typename ViewType::memory_space>::accessible) {} + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 2> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, result); + } + } + } else { +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(+ : result) + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, result); + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 3> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, result); + } + } + } + } else { +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(+ : result) + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, result); + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 4> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types.
+ if constexpr (UseReducer) { +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, i3, result); + } + } + } + } + } else { +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(+ : result) + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, i3, result); + } + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 5> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, i3, i4, result); + } + } + } + } + } + } else { +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(+ : result) + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, i3, i4, result); + } + } + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 6> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateLeft) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for 
different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, i3, i4, i5, result); + } + } + } + } + } + } + } else { +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(+ : result) + for (auto i5 = begin_5; i5 < end_5; ++i5) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i0 = begin_0; i0 < end_0; ++i0) { + functor(i0, i1, i2, i3, i4, i5, result); + } + } + } + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 2> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + functor(i0, i1, result); + } + } + } else { +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(+ : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + functor(i0, i1, result); + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 3> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + functor(i0, i1, i2, result); + } + } + } + } else { +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(+ : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + functor(i0, i1, i2, result); + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 4> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types.
+ if constexpr (UseReducer) { +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + functor(i0, i1, i2, i3, result); + } + } + } + } + } else { +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(+ : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + functor(i0, i1, i2, i3, result); + } + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 5> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for different reduction types. 
+ if constexpr (UseReducer) { +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + functor(i0, i1, i2, i3, i4, result); + } + } + } + } + } + } else { +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(+ : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + functor(i0, i1, i2, i3, i4, result); + } + } + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <int Rank, class ValueType> + inline std::enable_if_t<Rank == 6> execute_tile( + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, + OpenMPTargetIterateRight) const { + const Index begin_0 = policy.m_lower[0]; + const Index begin_1 = policy.m_lower[1]; + const Index begin_2 = policy.m_lower[2]; + const Index begin_3 = policy.m_lower[3]; + const Index begin_4 = policy.m_lower[4]; + const Index begin_5 = policy.m_lower[5]; + + const Index end_0 = policy.m_upper[0]; + const Index end_1 = policy.m_upper[1]; + const Index end_2 = policy.m_upper[2]; + const Index end_3 = policy.m_upper[3]; + const Index end_4 = policy.m_upper[4]; + const Index end_5 = policy.m_upper[5]; + + ValueType result = ValueType(); + + // FIXME_OPENMPTARGET: Unable to separate directives and their companion + // loops which leads to code duplication for 
different reduction types. + if constexpr (UseReducer) { +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i5 = begin_5; i5 < end_5; ++i5) { + functor(i0, i1, i2, i3, i4, i5, result); + } + } + } + } + } + } + } else { +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(+ : result) + for (auto i0 = begin_0; i0 < end_0; ++i0) { + for (auto i1 = begin_1; i1 < end_1; ++i1) { + for (auto i2 = begin_2; i2 < end_2; ++i2) { + for (auto i3 = begin_3; i3 < end_3; ++i3) { + for (auto i4 = begin_4; i4 < end_4; ++i4) { + for (auto i5 = begin_5; i5 < end_5; ++i5) { + functor(i0, i1, i2, i3, i4, i5, result); + } + } + } + } + } + } + } + + ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), + m_result_ptr_on_device); + } + + template <typename Policy, typename Functor> + static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +#endif /* KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4452af3846d28b96d17116236ceeb3f195cc16b8..b7c8abcb449566f1a995a8e488fe07e6528187f5 100644 --- 
a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -21,6 +21,7 @@ #include <sstream> #include <Kokkos_Parallel.hpp> #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp> namespace Kokkos { namespace Impl { @@ -33,8 +34,6 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; @@ -55,11 +54,17 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - using TagType = typename Policy::work_tag; public: void execute() const { - const FunctorType& functor = m_functor_reducer.get_functor(); + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock<std::mutex> scratch_memory_lock( + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy>( + m_functor_reducer.get_functor()); + if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -72,26 +77,26 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, // Enter this loop if the reduction is on an array and the routine is // templated over the size of the array. 
if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array<TagType, 2>( + ParReduceSpecialize::template execute_array<2>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array<TagType, 4>( + ParReduceSpecialize::template execute_array<4>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array<TagType, 8>( + ParReduceSpecialize::template execute_array<8>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array<TagType, 16>( + ParReduceSpecialize::template execute_array<16>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array<TagType, 32>( + ParReduceSpecialize::template execute_array<32>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else { Kokkos::abort("array reduction length must be <= 32"); } } else { // This loop handles the basic scalar reduction. 
- ParReduceSpecialize::template execute_array<TagType, 1>( + ParReduceSpecialize::template execute_array<1>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index a302fa7151152d6c4199d189cfa68b491076a430..b81e3aa7ed0b1b97960cb5bd784c4d61548033c0 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -59,7 +59,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier if constexpr (std::is_arithmetic<ValueType>::value) { -#pragma omp for reduction(+ : TeamThread_scratch[:1]) +#pragma omp for reduction(+ : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -68,7 +68,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for reduction(custom : TeamThread_scratch[:1]) +#pragma omp for reduction(custom : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -90,11 +90,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< const Lambda& lambda, ReducerType result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custominner:ValueType \ - : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) +#pragma omp declare reduction(custominner \ +:ValueType : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, \ + omp_in)) \ + 
initializer(Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of // elements in the array <= 32. For reduction we allocate, 16 bytes per @@ -109,7 +108,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]); #pragma omp barrier -#pragma omp for reduction(custominner : TeamThread_scratch[:1]) +#pragma omp for reduction(custominner : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamThread_scratch[0]); } @@ -132,11 +131,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< ValueType* TeamThread_scratch = static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -145,8 +143,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for reduction(omp_red_teamthread_reducer \ - : TeamThread_scratch[:1]) schedule(static, 1) +#pragma omp for reduction( \ + omp_red_teamthread_reducer : TeamThread_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -259,11 +258,10 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< const Lambda& lambda, ReducerType const& result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custom:ValueType 
\ - : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) ValueType vector_reduce; Impl::OpenMPTargetReducerWrapper<ReducerType>::init(vector_reduce); @@ -329,7 +327,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( #pragma omp barrier if constexpr (std::is_arithmetic<ValueType>::value) { -#pragma omp for simd reduction(+ : TeamVector_scratch[:1]) +#pragma omp for simd reduction(+ : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -338,7 +336,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -363,11 +361,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< static_assert(sizeof(ValueType) <= Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) ValueType* TeamVector_scratch = static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch()); @@ -376,7 +373,7 @@ parallel_reduce(const 
Impl::TeamVectorRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamVector_scratch[0]); #pragma omp barrier -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamVector_scratch[0]); } @@ -400,11 +397,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< ValueType* TeamVector_scratch = static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -413,8 +409,9 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for simd reduction(omp_red_teamthread_reducer \ - : TeamVector_scratch[:1]) schedule(static, 1) +#pragma omp for simd reduction( \ + omp_red_teamthread_reducer : TeamVector_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -443,8 +440,7 @@ class ParallelReduce<CombinedFunctorReducerType, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Member = typename Policy::member_type; using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; @@ -472,7 +468,12 @@ class 
ParallelReduce<CombinedFunctorReducerType, public: void execute() const { - const FunctorType& functor = m_functor_reducer.get_functor(); + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock<std::mutex> scratch_memory_lock( + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy>( + m_functor_reducer.get_functor()); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, m_result_ptr_on_device); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 1d6677a1df6ba49cd3e720f8bcf67696779dda32..ec8a96cb2f36b0619bfe76bda2695527d46769d0 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -20,6 +20,7 @@ #include <omp.h> #include <sstream> #include <Kokkos_Parallel.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp> namespace Kokkos { namespace Impl { @@ -30,7 +31,6 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, protected: using Policy = Kokkos::RangePolicy<Traits...>; - using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using idx_type = typename Policy::index_type; @@ -48,18 +48,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; - template <class TagType> - std::enable_if_t<std::is_void<TagType>::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(idx, val, is_final); - } - template <class TagType> - std::enable_if_t<!std::is_void<TagType>::value> call_with_tag( - const FunctorType& f, const idx_type& 
idx, value_type& val, - const bool& is_final) const { - f(WorkTag(), idx, val, is_final); - } + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, Policy>; public: void impl_execute( @@ -77,8 +67,10 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, idx_type team_size = 128; auto a_functor_reducer = m_functor_reducer; -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) + auto a_functor = FunctorAdapter(m_functor_reducer.get_functor()); + +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -91,9 +83,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, const idx_type idx = local_offset + i; value_type val; reducer.init(&val); - if (idx < N) - call_with_tag<WorkTag>(a_functor_reducer.get_functor(), idx, val, - false); + if (idx < N) a_functor(idx, val, false); + element_values(team_id, i) = val; } #pragma omp barrier @@ -120,9 +111,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, } } -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) \ - thread_limit(team_size) +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) thread_limit(team_size) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -143,14 +133,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, local_offset_value = element_values(team_id, i - 1); // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs #if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \ - !defined(KOKKOS_ARCH_AMD_GFX1100) + !defined(KOKKOS_ARCH_AMD_GFX1100) && !defined(KOKKOS_ARCH_AMD_GFX1103) if constexpr 
(Analysis::Reducer::has_join_member_function()) { - if constexpr (std::is_void_v<WorkTag>) - a_functor_reducer.get_functor().join(local_offset_value, - offset_value); - else - a_functor_reducer.get_functor().join( - WorkTag{}, local_offset_value, offset_value); + a_functor.get_functor().join(local_offset_value, offset_value); } else local_offset_value += offset_value; #else @@ -158,9 +143,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, #endif } else local_offset_value = offset_value; - if (idx < N) - call_with_tag<WorkTag>(a_functor_reducer.get_functor(), idx, - local_offset_value, true); + if (idx < N) a_functor(idx, local_offset_value, true); + if (idx == N - 1 && m_result_ptr_device_accessible) *m_result_ptr = local_offset_value; } @@ -169,14 +153,18 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, } void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const idx_type N = m_policy.end() - m_policy.begin(); const idx_type chunk_size = 128; const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + // Only let one ParallelReduce instance at a time use the scratch memory. 
+ std::scoped_lock<std::mutex> scratch_memory_lock( + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View<value_type**, Kokkos::LayoutRight, Kokkos::Experimental::OpenMPTargetSpace> @@ -216,15 +204,21 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); const int chunk_size = 128; const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; if (N > 0) { + // Only let one ParallelReduce instance at a time use the scratch memory. + std::scoped_lock<std::mutex> scratch_memory_lock( + base_t::m_policy.space() + .impl_internal_space_instance() + ->m_mutex_scratch_ptr); + // This could be scratch memory per team Kokkos::View<value_type**, Kokkos::LayoutRight, Kokkos::Experimental::OpenMPTargetSpace> @@ -238,8 +232,10 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, if (!base_t::m_result_ptr_device_accessible) { const int size = base_t::m_functor_reducer.get_reducer().value_size(); - DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace>( - base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); + DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace, + Kokkos::Experimental::OpenMPTarget>( + base_t::m_policy.space(), base_t::m_result_ptr, + chunk_values.data() + (n_chunks - 1), size); } } else if (!base_t::m_result_ptr_device_accessible) { base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp 
b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index fb75f05f270104f6764a39977106562b7cdd226a..3af8638ed830309b09beb9e1744799642df47a5c 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -21,6 +21,8 @@ #include <sstream> #include <Kokkos_Parallel.hpp> #include <OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp> namespace Kokkos { namespace Impl { @@ -71,7 +73,6 @@ template <class FunctorType, class ReducerType, class PointerType, struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, ReducerType, PointerType, ValueType> { using PolicyType = Kokkos::RangePolicy<PolicyArgs...>; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t<std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType>; @@ -81,12 +82,15 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, using ParReduceCopy = ParallelReduceCopy<PointerType>; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, PolicyType>; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); const auto begin = p.begin(); @@ -103,33 +107,27 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, 
return; } -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v<TagType>) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), ptr_on_device); } - template <class TagType, int NumReductions> - static void execute_array(const FunctorType& f, const PolicyType& p, + template <int NumReductions> + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); const auto begin = p.begin(); @@ -149,27 +147,14 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, // Case where reduction is on a native data type. 
if constexpr (std::is_arithmetic<ValueType>::value) { -#pragma omp target teams distribute parallel for \ - map(to:f) reduction(+: result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v<TagType>) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result) + for (auto i = begin; i < end; ++i) f(i, result); } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v<TagType>) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) + for (auto i = begin; i < end; ++i) f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), @@ -185,13 +170,10 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, ptr_on_device); return; } -#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result[ : NumReductions]) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v<TagType>) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result( @@ -199,12 +181,12 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, } } - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); - OpenMPTargetExec::verify_initialized( + 
Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); const auto begin = p.begin(); @@ -218,23 +200,25 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, const auto size = end - begin; - // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently + // FIXME_OPENMPTARGET: The team size and concurrency are currently // based on NVIDIA-V100 and should be modifid to be based on the // architecture in the future. const int max_team_threads = 32; const int max_teams = - OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; + p.space().impl_internal_space_instance()->concurrency() / + max_team_threads; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. Achieved by setting the first // parameter of `resize_scratch=1`. 
- OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - std::numeric_limits<int64_t>::max()); - ValueType* scratch_ptr = - static_cast<ValueType*>(OpenMPTargetExec::get_scratch_ptr()); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), + std::numeric_limits<int64_t>::max()); + ValueType* scratch_ptr = static_cast<ValueType*>( + p.space().impl_internal_space_instance()->get_scratch_ptr()); - typename FunctorAnalysis::Reducer final_reducer(f); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { #pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) @@ -259,8 +243,7 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, } #pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to \ - : final_reducer) is_device_ptr(scratch_ptr) + map(to : final_reducer) is_device_ptr(scratch_ptr) { #pragma omp parallel { @@ -278,11 +261,7 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, // Accumulate partial results in thread specific storage. #pragma omp for simd for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void_v<TagType>) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } // Reduce all paritial results within a team. 
@@ -303,8 +282,7 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ +#pragma omp target teams distribute parallel for simd map(to : f) \ is_device_ptr(scratch_ptr) for (int i = 0; i < max_teams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { @@ -343,7 +321,6 @@ template <class FunctorType, class ReducerType, class PointerType, struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, ReducerType, PointerType, ValueType> { using PolicyType = TeamPolicyInternal<PolicyArgs...>; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t<std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType>; @@ -354,12 +331,15 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, using ParReduceCopy = ParallelReduceCopy<PointerType>; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter<FunctorType, PolicyType>; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); @@ -369,9 +349,11 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = 
OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); ValueType result = ValueType(); @@ -382,21 +364,22 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. if (max_active_teams <= 0) return; -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(max_active_teams) thread_limit(team_size) + firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom + : result) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -411,16 +394,13 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v<TagType>) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } 
#else #pragma omp target teams distribute firstprivate(f) is_device_ptr(scratch_ptr) \ - num_teams(max_active_teams) thread_limit(team_size) reduction(custom \ - : result) + num_teams(max_active_teams) thread_limit(team_size) \ + reduction(custom : result) for (int i = 0; i < league_size; i++) { #pragma omp parallel reduction(custom : result) { @@ -430,10 +410,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, typename PolicyType::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v<TagType>) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #endif @@ -444,12 +421,12 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, } template <int NumReductions> - static void execute_array(const FunctorType& f, const PolicyType& p, + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); @@ -459,9 +436,11 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + 
p.space().impl_internal_space_instance()->get_scratch_ptr(); // Maximum active teams possible. // FIXME_OPENMPTARGET: Cray compiler did not yet implement @@ -470,7 +449,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. @@ -482,9 +461,11 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic<ValueType>::value) { -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+: result) + // Use scratch memory extensions to request dynamic shared memory for + // the right compiler/architecture combination. + KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \ + is_device_ptr(scratch_ptr) reduction(+: result) \ + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(+ : result) { if (omp_get_num_teams() > max_active_teams) @@ -499,19 +480,14 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v<TagType>) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } else { // Case where the reduction is on a non-native data type. 
#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) #pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - map(to \ - : f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + map(to : f) is_device_ptr(scratch_ptr) reduction(custom : result) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -526,10 +502,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v<TagType>) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } @@ -540,10 +513,10 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, } else { ValueType result[NumReductions] = {}; // Case where the reduction is on an array. -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) -#pragma omp parallel reduction(+ : result[:NumReductions]) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + map(to : f) is_device_ptr(scratch_ptr) \ + reduction(+ : result[ : NumReductions]) +#pragma omp parallel reduction(+ : result[ : NumReductions]) { if (omp_get_num_teams() > max_active_teams) Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); @@ -557,10 +530,7 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v<TagType>) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } @@ -572,12 +542,12 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, // FIXME_OPENMPTARGET : This 
routine is a copy from `parallel_reduce` over // RangePolicy. Need a new implementation. - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join "); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join"); using FunctorAnalysis = @@ -606,13 +576,14 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, const auto nteams = league_size; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - typename FunctorAnalysis::Reducer final_reducer(f); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { // If there is no work to be done, copy back the initialized values and @@ -636,11 +607,13 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, return; } - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) - { + // Use scratch memory extensions to request dynamic shared memory for the + // right compiler/architecture combination. 
+ KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(nteams) thread_limit(team_size) map(to + : f) + is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) { #pragma omp parallel { const int team_num = omp_get_team_num(); @@ -654,20 +627,15 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v<TagType>) { - f(team, result); - } else { - f(TagType(), team, result); - } + f(team, result); } } // end parallel } // end target int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : final_reducer) \ - is_device_ptr(scratch_ptr) +#pragma omp target teams distribute parallel for simd firstprivate( \ + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp deleted file mode 100644 index 41e62ce6e6b32e155f8fe886fdfd9132afe6247c..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ /dev/null @@ -1,787 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP - -#include <omp.h> -#include <Kokkos_Parallel.hpp> -#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp> -#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp> - -// WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly, -// this was tracked down to a bug in clang with regards of mapping structs -// with arrays of long in it. Arrays of int might be fine though ... -#define KOKKOS_IMPL_MDRANGE_USE_NO_TILES // undef EOF - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <class FunctorType, class... Traits> -class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy<Traits...>; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - const FunctorType m_functor; - const Policy m_policy; - - public: - inline void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); - Policy policy = m_policy; - -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - typename Policy::point_type unused; - - execute_tile<Policy::rank>(unused, functor, policy); -#else - const int64_t begin = 0; - const int64_t end = m_policy.m_num_tiles; - -#pragma omp target teams distribute map(to : functor) num_teams(end - begin) - { - for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) { - -#pragma omp parallel - { - typename Policy::point_type offset; - if (Policy::outer_direction 
== Policy::Left) { - for (int i = 0; i < Policy::rank; ++i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } else { - for (int i = Policy::rank - 1; i >= 0; --i) { - offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + - policy.m_lower[i]; - tile_idx /= policy.m_tile_end[i]; - } - } - execute_tile<Policy::rank>(offset, functor, policy); - } - } - } -#endif - } - - template <int Rank> - inline std::enable_if_t<Rank == 2> execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); - } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? 
end_1 : policy.m_upper[1]; - -#pragma omp for collapse(2) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); - } -#endif - } - - template <int Rank> - inline std::enable_if_t<Rank == 3> execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); - } - } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? 
end_2 : policy.m_upper[2]; - -#pragma omp for collapse(3) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); - } -#endif - } - - template <int Rank> - inline std::enable_if_t<Rank == 4> execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); - } - } - } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? 
end_3 : policy.m_upper[3]; - -#pragma omp for collapse(4) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); - } -#endif - } - - template <int Rank> - inline std::enable_if_t<Rank == 5> execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); - } - } - } - } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? 
end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - -#pragma omp for collapse(5) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); - } -#endif - } - - template <int Rank> - inline std::enable_if_t<Rank == 6> execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy) const { -#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES - (void)offset; - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - const Index begin_5 = policy.m_lower[5]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - const Index end_5 = policy.m_upper[5]; - -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) 
{ - for (auto i5 = begin_5; i5 < end_5; ++i5) { - { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); - } - } - } - } - } - } - } -#else - const ptrdiff_t begin_0 = offset[0]; - ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; - end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; - - const ptrdiff_t begin_1 = offset[1]; - ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; - end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; - - const ptrdiff_t begin_2 = offset[2]; - ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; - end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; - - const ptrdiff_t begin_3 = offset[3]; - ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; - end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; - - const ptrdiff_t begin_4 = offset[4]; - ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; - end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; - - const ptrdiff_t begin_5 = offset[5]; - ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; - end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5]; - -#pragma omp for collapse(6) - for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0) - for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) - for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) - for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) - for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) - for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5); - } -#endif - } - - inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - // TODO DZP: based on a conversation with Christian, we're using 256 as a - // heuristic here. 
We need something better once we can query these kinds of - // properties - template <typename Policy, typename Functor> - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <class CombinedFunctorReducerType, class... Traits> -class ParallelReduce<CombinedFunctorReducerType, - Kokkos::MDRangePolicy<Traits...>, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy<Traits...>; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - static constexpr bool UseReducer = - !std::is_same_v<FunctorType, typename ReducerType::functor_type>; - - const pointer_type m_result_ptr; - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - - using ParReduceCopy = ParallelReduceCopy<pointer_type>; - - bool m_result_ptr_on_device; - - public: - inline void execute() const { - execute_tile<Policy::rank, typename ReducerType::value_type>( - m_functor_reducer.get_functor(), m_policy, m_result_ptr); - } - - template <class ViewType> - inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - Policy arg_policy, const ViewType& arg_result_view) - : m_result_ptr(arg_result_view.data()), - m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr_on_device( - MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace, - typename 
ViewType::memory_space>::accessible) {} - - template <int Rank, class ValueType> - inline std::enable_if_t<Rank == 2> execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. - if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); - } - } - } else { -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template <int Rank, class ValueType> - inline std::enable_if_t<Rank == 3> execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - - const Index end_0 = policy.m_upper[0]; - const Index 
end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. - if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper <typename ReducerType::functor_type>::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper <typename ReducerType::functor_type>::init( \ - omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void<typename Policy::work_tag>::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template <int Rank, class ValueType> - inline std::enable_if_t<Rank == 4> execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = 
policy.m_upper[3]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. - if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template <int Rank, class ValueType> - inline std::enable_if_t<Rank == 5> execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - 
const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. - if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); - } - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); - } - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template <int Rank, class ValueType> - inline std::enable_if_t<Rank == 6> execute_tile(const FunctorType& functor, - const Policy& policy, - pointer_type ptr) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = 
policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - const Index begin_5 = policy.m_lower[5]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - const Index end_5 = policy.m_upper[5]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. - if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); - } - } - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same<typename Policy::work_tag, - void>::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - 
functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); - } - } - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template <typename Policy, typename Functor> - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#undef KOKKOS_IMPL_MDRANGE_USE_NO_TILES -#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 672271ed6b909a6f05b9397b3b3a600e9000b895..4308fb042a34cd71ed0a1d873782b29e18fba46d 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -18,7 +18,6 @@ #define KOKKOS_OPENMPTARGETREDUCER_HPP #include <impl/Kokkos_Traits.hpp> -#include <impl/Kokkos_Spinwait.hpp> #include <Kokkos_Atomic.hpp> #include "Kokkos_OpenMPTarget_Abort.hpp" @@ -35,9 +34,6 @@ struct OpenMPTargetReducerWrapper { KOKKOS_INLINE_FUNCTION static void join(value_type&, const value_type&) = delete; - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type&, const volatile value_type&) = delete; - KOKKOS_INLINE_FUNCTION static void init(value_type&) = delete; }; @@ -52,11 +48,6 @@ struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest += src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest += src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity<value_type>::sum(); @@ -73,11 +64,6 @@ struct 
OpenMPTargetReducerWrapper<Prod<Scalar, Space>> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest *= src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest *= src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity<value_type>::prod(); @@ -96,11 +82,6 @@ struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> { if (src < dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src < dest) dest = src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity<value_type>::min(); @@ -119,11 +100,6 @@ struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> { if (src > dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src > dest) dest = src; - } - // Required KOKKOS_INLINE_FUNCTION static void init(value_type& val) { @@ -142,11 +118,6 @@ struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> { dest = dest && src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest && src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity<value_type>::land(); @@ -167,11 +138,6 @@ struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> { dest = dest || src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest || src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity<value_type>::lor(); @@ -190,11 +156,6 @@ struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> { dest = dest & src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest & src; - } - KOKKOS_INLINE_FUNCTION static void 
init(value_type& val) { val = reduction_identity<value_type>::band(); @@ -213,11 +174,6 @@ struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> { dest = dest | src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest | src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity<value_type>::bor(); @@ -237,12 +193,12 @@ struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> { // Required KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity<index_type>::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -264,12 +220,12 @@ struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val > dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity<index_type>::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -299,16 +255,6 @@ struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity<scalar_type>::max(); @@ -332,22 +278,16 @@ struct 
OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity<index_type>::min()) { dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity<index_type>::min()) { + dest.max_loc = src.max_loc; } } @@ -386,15 +326,6 @@ struct OpenMPTargetReducerWrapper<MaxFirstLoc<Scalar, Index, Space>> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity<scalar_type>::max(); @@ -429,15 +360,6 @@ struct OpenMPTargetReducerWrapper<MinFirstLoc<Scalar, Index, Space>> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? 
src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity<scalar_type>::min(); @@ -481,23 +403,6 @@ struct OpenMPTargetReducerWrapper<MinMaxFirstLastLoc<Scalar, Index, Space>> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity<scalar_type>::max(); @@ -532,13 +437,6 @@ struct OpenMPTargetReducerWrapper<FirstLoc<Index, Space>> { : dest.min_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? src.min_loc_true - : dest.min_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_true = reduction_identity<index_type>::min(); @@ -570,13 +468,6 @@ struct OpenMPTargetReducerWrapper<LastLoc<Index, Space>> { : dest.max_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? 
src.max_loc_true - : dest.max_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = reduction_identity<index_type>::max(); @@ -612,17 +503,6 @@ struct OpenMPTargetReducerWrapper<StdIsPartitioned<Index, Space>> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = ::Kokkos::reduction_identity<index_type>::max(); @@ -655,13 +535,6 @@ struct OpenMPTargetReducerWrapper<StdPartitionPoint<Index, Space>> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min(); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp deleted file mode 100644 index 458c4c9a43e617b0ad3c6bcecf823acc931fb109..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp +++ /dev/null @@ -1,251 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include <Kokkos_Core.hpp> - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ENABLE_TASKPOLICY) - -#include <impl/Kokkos_TaskQueue_impl.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template class TaskQueue<Kokkos::Experimental::OpenMPTarget>; - -//---------------------------------------------------------------------------- - -TaskExec<Kokkos::Experimental::OpenMPTarget>::TaskExec() - : m_self_exec(0), - m_team_exec(0), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(0), - m_team_rank(0), - m_team_size(1) {} - -TaskExec<Kokkos::Experimental::OpenMPTarget>::TaskExec( - Kokkos::Impl::OpenMPTargetExec &arg_exec, int const arg_team_size) - : m_self_exec(&arg_exec), - m_team_exec(arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size)), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(arg_exec.pool_rank_rev() / arg_team_size), - m_team_rank(arg_exec.pool_rank_rev() % arg_team_size), - m_team_size(arg_team_size) { - // This team spans - // m_self_exec->pool_rev( team_size * group_rank ) - // m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 ) - - int64_t volatile *const sync = (int64_t *)m_self_exec->scratch_reduce(); - - sync[0] = int64_t(0); - sync[1] = int64_t(0); - - for (int i = 0; i < m_team_size; ++i) { - m_sync_value |= int64_t(1) << (8 * i); - m_sync_mask |= int64_t(3) << (8 * i); - } - - Kokkos::memory_fence(); -} - -void TaskExec<Kokkos::Experimental::OpenMPTarget>::team_barrier_impl() const { - if (m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t))) { - Kokkos::abort("TaskQueue<OpenMPTarget> scratch_reduce 
memory too small"); - } - - // Use team shared memory to synchronize. - // Alternate memory locations between barriers to avoid a sequence - // of barriers overtaking one another. - - int64_t volatile *const sync = - ((int64_t *)m_team_exec->scratch_reduce()) + (m_sync_step & 0x01); - - // This team member sets one byte within the sync variable - int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank; - - *sync_self = int8_t(m_sync_value & 0x03); // signal arrival - - while (m_sync_value != *sync) - ; // wait for team to arrive - - ++m_sync_step; - - if (0 == (0x01 & m_sync_step)) { // Every other step - m_sync_value ^= m_sync_mask; - if (1000 < m_sync_step) m_sync_step = 0; - } -} - -//---------------------------------------------------------------------------- - -void TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget>::execute( - TaskQueue<Kokkos::Experimental::OpenMPTarget> *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue<execution_space>; - using task_root_type = TaskBase<execution_space, void, void>; - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - using Member = TaskExec<execution_space>; - - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - // Required: team_size <= 8 - - const int team_size = PoolExec::pool_size(2); // Threads per core - // const int team_size = PoolExec::pool_size(1); // Threads per NUMA - - if (8 < team_size) { - Kokkos::abort("TaskQueue<OpenMPTarget> unsupported team size"); - } - -#pragma omp parallel - { - PoolExec &self = *PoolExec::get_thread_omp(); - - Member single_exec; - Member team_exec(self, team_size); - - // Team shared memory - task_root_type *volatile *const task_shared = - (task_root_type **)team_exec.m_team_exec->scratch_thread(); - -// Barrier across entire OpenMPTarget thread pool to insure initialization -#pragma omp barrier - - // Loop until all queues are empty and no tasks in flight - - do { - task_root_type 
*task = 0; - - // Each team lead attempts to acquire either a thread team task - // or a single thread task for the team. - - if (0 == team_exec.team_rank()) { - task = 0 < *((volatile int *)&queue->m_ready_count) ? end : 0; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - } - - // Team lead broadcast acquired task to team members: - - if (1 < team_exec.team_size()) { - if (0 == team_exec.team_rank()) *task_shared = task; - - // Fence to be sure task_shared is stored before the barrier - Kokkos::memory_fence(); - - // Whole team waits for every team member to reach this statement - team_exec.team_barrier(); - - // Fence to be sure task_shared is stored - Kokkos::memory_fence(); - - task = *task_shared; - } - - if (0 == task) break; // 0 == m_ready_count - - if (end == task) { - // All team members wait for whole team to reach this statement. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. - team_exec.team_barrier(); - } else if (task_root_type::TaskTeam == task->m_task_type) { - // Thread Team Task - (*task->m_apply)(task, &team_exec); - - // The m_apply function performs a barrier - - if (0 == team_exec.team_rank()) { - // team member #0 completes the task, which may delete the task - queue->complete(task); - } - } else { - // Single Thread Task - - if (0 == team_exec.team_rank()) { - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - } - - // All team members wait for whole team to reach this statement. - // Not necessary to complete the task. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. 
- team_exec.team_barrier(); - } - } while (1); - } - // END #pragma omp parallel -} - -void TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget>:: - iff_single_thread_recursive_execute( - TaskQueue<Kokkos::Experimental::OpenMPTarget> *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue<execution_space>; - using task_root_type = TaskBase<execution_space, void, void>; - using Member = TaskExec<execution_space>; - - if (1 == omp_get_num_threads()) { - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - Member single_exec; - - task_root_type *task = end; - - do { - task = end; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - - if (end == task) break; - - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - - } while (1); - } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( \ - KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp deleted file mode 100644 index c9aa7b128f17eda3782b903c07c001217b406acd..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp +++ /dev/null @@ -1,319 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP -#define KOKKOS_IMPL_OPENMP_TASK_HPP - -#if defined(KOKKOS_ENABLE_TASKPOLICY) - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <> -class TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget> { - public: - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = Kokkos::Impl::TaskQueue<execution_space>; - using task_base_type = Kokkos::Impl::TaskBase<execution_space, void, void>; - - // Must specify memory space - using memory_space = Kokkos::HostSpace; - - static void iff_single_thread_recursive_execute(queue_type* const); - - // Must provide task queue execution function - static void execute(queue_type* const); - - // Must provide mechanism to set function pointer in - // execution space from the host process. 
- template <typename FunctorType> - static void proc_set_apply(task_base_type::function_type* ptr) { - using TaskType = TaskBase<Kokkos::Experimental::OpenMPTarget, - typename FunctorType::value_type, FunctorType>; - *ptr = TaskType::apply; - } -}; - -extern template class TaskQueue<Kokkos::Experimental::OpenMPTarget>; - -//---------------------------------------------------------------------------- - -template <> -class TaskExec<Kokkos::Experimental::OpenMPTarget> { - private: - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; - TaskExec& operator=(TaskExec const&) = delete; - - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - - friend class Kokkos::Impl::TaskQueue<Kokkos::Experimental::OpenMPTarget>; - friend class Kokkos::Impl::TaskQueueSpecialization< - Kokkos::Experimental::OpenMPTarget>; - - PoolExec* const m_self_exec; ///< This thread's thread pool data structure - PoolExec* const m_team_exec; ///< Team thread's thread pool data structure - int64_t m_sync_mask; - int64_t mutable m_sync_value; - int mutable m_sync_step; - int m_group_rank; ///< Which "team" subset of thread pool - int m_team_rank; ///< Which thread within a team - int m_team_size; - - TaskExec(); - TaskExec(PoolExec& arg_exec, int arg_team_size); - - void team_barrier_impl() const; - - public: - KOKKOS_FUNCTION void* team_shared() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread() : nullptr;)) - - KOKKOS_IF_ON_DEVICE((return nullptr;)) - } - - KOKKOS_FUNCTION int team_shared_size() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread_size() : 0;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /**\brief Whole team enters this function call - * before any teeam member returns from - * this function call. 
- */ - KOKKOS_FUNCTION void team_barrier() const { - KOKKOS_IF_ON_HOST((if (1 < m_team_size) { team_barrier_impl(); })) - } - - KOKKOS_INLINE_FUNCTION - int team_rank() const { return m_team_rank; } - - KOKKOS_INLINE_FUNCTION - int team_size() const { return m_team_size; } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template <typename iType> -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> > -TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>& thread, - const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >(thread, - count); -} - -template <typename iType> -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> > -TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>& thread, - const iType& start, const iType& end) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >(thread, start, - end); -} - -/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team. 
- */ -template <typename iType, class Lambda> -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& - loop_boundaries, - const Lambda& lambda) { - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i); - } -} - -template <typename iType, class Lambda, typename ValueType> -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - shared[0] += shared[i]; - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -template <typename iType, class Lambda, typename ValueType, class JoinType> -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i 
+= loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - join(shared[0], shared[i]); - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -// placeholder for future function -template <typename iType, class Lambda, typename ValueType> -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) {} - -// placeholder for future function -template <typename iType, class Lambda, typename ValueType, class JoinType> -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { -} - -template <typename ValueType, typename iType, class Lambda> -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& - loop_boundaries, - const Lambda& lambda) { - ValueType accum = 0; - ValueType val, local_total; - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - int team_size = loop_boundaries.thread.team_size(); - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - - // Intra-member scan - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - 
lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } - - shared[team_rank] = accum; - loop_boundaries.thread.team_barrier(); - - // Member 0 do scan on accumulated totals - if (team_rank == 0) { - for (iType i = 1; i < team_size; i += 1) { - shared[i] += shared[i - 1]; - } - accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan - } - - loop_boundaries.thread.team_barrier(); - - // Inter-member scan adding in accumulated totals - if (team_rank != 0) { - accum = shared[team_rank - 1]; - } - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } -} - -// placeholder for future function -template <typename iType, class Lambda, typename ValueType> -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& - loop_boundaries, - const Lambda& lambda) {} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ -#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 7fa935f693a431f1cfc116cddd5208f289ca5840..2583a1cdc0473de055916edbe0b88f4052a3c887 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -46,7 +46,6 @@ struct Container { } // namespace namespace Kokkos { -namespace Experimental { SYCL::SYCL() : m_space_instance(&Impl::SYCLInternal::singleton(), [](Impl::SYCLInternal*) {}) { @@ -88,26 +87,87 @@ bool SYCL::impl_is_initialized() { void SYCL::impl_finalize() { 
Impl::SYCLInternal::singleton().finalize(); } void SYCL::print_configuration(std::ostream& os, bool verbose) const { - os << "Devices:\n"; - os << " KOKKOS_ENABLE_SYCL: yes\n"; - os << "\nRuntime Configuration:\n"; - os << "macro KOKKOS_ENABLE_SYCL : defined\n"; +#ifdef KOKKOS_ENABLE_ONEDPL + os << "macro KOKKOS_ENABLE_ONEDPL : defined\n"; +#else + os << "macro KOKKOS_ENABLE_ONEDPL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif - +#ifdef KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : defined\n"; +#else + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : undefined\n"; +#endif +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; +#else + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif +#ifdef SYCL_EXT_ONEAPI_GRAPH + os << "macro SYCL_EXT_ONEAPI_GRAPH : defined\n"; +#else + os << "macro SYCL_EXT_ONEAPI_GRAPH : undefined\n"; +#endif +#ifdef SYCL_EXT_INTEL_QUEUE_IMMEDIATE_COMMAND_LIST + if (sycl_queue() + .has_property< + sycl::ext::intel::property::queue::immediate_command_list>()) + os << "Immediate command lists enforced\n"; + else if (sycl_queue() + .has_property<sycl::ext::intel::property::queue:: + no_immediate_command_list>()) + os << "Standard command queue enforced\n"; + else +#endif + { + os << "Immediate command lists and standard command queue allowed.\n"; + if (const char* environment_setting = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS")) + os << "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=" + << environment_setting << " takes precedence.\n"; + else + os 
<< "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS not defined.\n"; + } + + int counter = 0; + int active_device = Kokkos::device_id(); + std::cout << "\nAvailable devices: \n"; + std::vector<sycl::device> devices = Impl::get_sycl_devices(); + for (const auto& device : devices) { + std::string device_type; + switch (device.get_info<sycl::info::device::device_type>()) { + case sycl::info::device_type::cpu: device_type = "cpu"; break; + case sycl::info::device_type::gpu: device_type = "gpu"; break; + case sycl::info::device_type::accelerator: + device_type = "accelerator"; + break; + case sycl::info::device_type::custom: device_type = "custom"; break; + case sycl::info::device_type::automatic: device_type = "automatic"; break; + case sycl::info::device_type::host: device_type = "host"; break; + case sycl::info::device_type::all: device_type = "all"; break; + } + os << "[" << device.get_backend() << "]:" << device_type << ':' << counter + << "] " << device.get_info<sycl::info::device::name>(); + if (counter == active_device) os << " : Selected"; + os << '\n'; + ++counter; + } - if (verbose) + if (verbose) { + os << '\n'; SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device()); + } } void SYCL::fence(const std::string& name) const { @@ -116,8 +176,7 @@ void SYCL::fence(const std::string& name) const { } void SYCL::impl_static_fence(const std::string& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::SYCL>( name, Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, @@ -137,20 +196,11 @@ void SYCL::impl_static_fence(const std::string& name) { } void SYCL::impl_initialize(InitializationSettings const& settings) { - std::vector<sycl::device> gpu_devices = - sycl::device::get_devices(sycl::info::device_type::gpu); - // If the device id is not specified and there are no GPUs, sidestep Kokkos - // device selection and use 
whatever is available (if no GPU architecture is - // specified). -#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - Impl::SYCLInternal::m_syclDev = 0; - return; - } -#endif - const auto id = ::Kokkos::Impl::get_gpu(settings); - Impl::SYCLInternal::singleton().initialize(gpu_devices[id]); + const auto& visible_devices = ::Kokkos::Impl::get_visible_devices(); + const auto id = + ::Kokkos::Impl::get_gpu(settings).value_or(visible_devices[0]); + std::vector<sycl::device> sycl_devices = Impl::get_sycl_devices(); + Impl::SYCLInternal::singleton().initialize(sycl_devices[id]); Impl::SYCLInternal::m_syclDev = id; } @@ -214,8 +264,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << device.get_info<device::image3d_max_depth>() << "\nImage Max Buffer Size: " << device.get_info<device::image_max_buffer_size>() - << "\nImage Max Array Size: " - << device.get_info<device::image_max_array_size>() << "\nMax Samplers: " << device.get_info<device::max_samplers>() << "\nMax Parameter Size: " << device.get_info<device::max_parameter_size>() @@ -243,9 +291,31 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, namespace Impl { +std::vector<sycl::device> get_sycl_devices() { +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) || \ + defined(KOKKOS_ARCH_AMD_GPU) + std::vector<sycl::device> devices = + sycl::device::get_devices(sycl::info::device_type::gpu); +#if defined(KOKKOS_ARCH_INTEL_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_level_zero; +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_cuda; +#elif defined(KOKKOS_ARCH_AMD_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_hip; +#endif + devices.erase(std::remove_if(devices.begin(), devices.end(), + [backend](const sycl::device& d) { + return d.get_backend() != backend; + }), + 
devices.end()); +#else + std::vector<sycl::device> devices = sycl::device::get_devices(); +#endif + return devices; +} + int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory<SYCL>("170_SYCL"); -} -} // namespace Experimental +} // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index be6b4b8930283aaa0e6ac67cb4b873af9bfae17d..937dcceab4832090666367d65f53ce6156e77d37 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -39,7 +39,6 @@ static_assert(false, #include <impl/Kokkos_InitializationSettings.hpp> namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal; } @@ -78,26 +77,21 @@ class SYCL { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__SYCL_DEVICE_ONLY__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. */ - static bool sleep(); - - /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ - static bool wake(); +#endif /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); - void fence( - const std::string& name = - "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence") const; + void fence(const std::string& name = + "Kokkos::SYCL::fence: Unnamed Instance Fence") const; /// \brief Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose = false) const; @@ -135,15 +129,13 @@ class SYCL { Kokkos::Impl::HostSharedPtr<Impl::SYCLInternal> m_space_instance; }; -} // namespace Experimental - namespace Tools { namespace Experimental { template <> -struct DeviceTypeTraits<Kokkos::Experimental::SYCL> { +struct DeviceTypeTraits<Kokkos::SYCL> { /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling static constexpr DeviceType id = DeviceType::SYCL; - static int device_id(const Kokkos::Experimental::SYCL& exec) { + static int device_id(const Kokkos::SYCL& exec) { return exec.impl_internal_space_instance()->m_syclDev; } }; @@ -188,8 +180,13 @@ std::vector<SYCL> partition_space(const SYCL& sycl_space, sycl::queue(context, device, sycl::property::queue::in_order())); return instances; } + } // namespace Experimental +namespace Impl { +std::vector<sycl::device> get_sycl_devices(); +} // namespace Impl + } // namespace Kokkos #endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp index afc7eebd38817c0f25fcc64b1201aa9ce28ea58d..a9e2eca4fb3a764885d3ff6c772e99ef6610398b 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -28,37 +28,34 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n); -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n); +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n); void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n); template <class MemSpace> -struct DeepCopy<MemSpace, HostSpace, Kokkos::Experimental::SYCL, +struct DeepCopy<MemSpace, HostSpace, Kokkos::SYCL, std::enable_if_t<is_sycl_type_space<MemSpace>::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const 
Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template <class MemSpace> -struct DeepCopy<HostSpace, MemSpace, Kokkos::Experimental::SYCL, +struct DeepCopy<HostSpace, MemSpace, Kokkos::SYCL, std::enable_if_t<is_sycl_type_space<MemSpace>::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template <class MemSpace1, class MemSpace2> -struct DeepCopy<MemSpace1, MemSpace2, Kokkos::Experimental::SYCL, +struct DeepCopy<MemSpace1, MemSpace2, Kokkos::SYCL, std::enable_if_t<is_sycl_type_space<MemSpace1>::value && is_sycl_type_space<MemSpace2>::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; @@ -66,10 +63,9 @@ struct DeepCopy<MemSpace1, MemSpace2, Kokkos::Experimental::SYCL, template <class MemSpace1, class MemSpace2, class ExecutionSpace> struct DeepCopy< MemSpace1, MemSpace2, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space<MemSpace1>::value && - is_sycl_type_space<MemSpace2>::value && - !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> { + std::enable_if_t<is_sycl_type_space<MemSpace1>::value && + is_sycl_type_space<MemSpace2>::value && + !std::is_same<ExecutionSpace, Kokkos::SYCL>::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -93,9 +89,8 @@ struct DeepCopy< template <class MemSpace, class ExecutionSpace> struct 
DeepCopy< MemSpace, HostSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space<MemSpace>::value && - !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> { + std::enable_if_t<is_sycl_type_space<MemSpace>::value && + !std::is_same<ExecutionSpace, Kokkos::SYCL>::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -118,9 +113,8 @@ struct DeepCopy< template <class MemSpace, class ExecutionSpace> struct DeepCopy< HostSpace, MemSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space<MemSpace>::value && - !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> { + std::enable_if_t<is_sycl_type_space<MemSpace>::value && + !std::is_same<ExecutionSpace, Kokkos::SYCL>::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..54ca64599532633839f1d93db6cdfd1eeb83b215 --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp @@ -0,0 +1,156 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SYCL_GRAPHNODEKERNEL_HPP +#define KOKKOS_SYCL_GRAPHNODEKERNEL_HPP + +#include <Kokkos_Graph_fwd.hpp> + +#include <impl/Kokkos_GraphImpl.hpp> + +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Parallel_Reduce.hpp> +#include <Kokkos_PointerOwnership.hpp> + +#include <SYCL/Kokkos_SYCL_GraphNode_Impl.hpp> + +namespace Kokkos { +namespace Impl { + +template <typename PolicyType, typename Functor, typename PatternTag, + typename... Args> +class GraphNodeKernelImpl<Kokkos::SYCL, PolicyType, Functor, PatternTag, + Args...> + : public PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType, + Args..., Kokkos::SYCL>::type { + public: + using Policy = PolicyType; + using graph_kernel = GraphNodeKernelImpl; + using base_t = + typename PatternImplSpecializationFromTag<PatternTag, Functor, Policy, + Args..., Kokkos::SYCL>::type; + + // TODO use the name and executionspace + template <typename PolicyDeduced, typename... ArgsDeduced> + GraphNodeKernelImpl(std::string, Kokkos::SYCL const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...) 
{} + + template <typename PolicyDeduced> + GraphNodeKernelImpl(Kokkos::SYCL const& exec_space, Functor arg_functor, + PolicyDeduced&& arg_policy) + : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} + + void set_sycl_graph_ptr( + sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable>* + arg_graph) { + m_graph_ptr = arg_graph; + } + + void set_sycl_graph_node_ptr( + std::optional<sycl::ext::oneapi::experimental::node>* arg_node) { + m_graph_node_ptr = arg_node; + } + + std::optional<sycl::ext::oneapi::experimental::node>& get_sycl_graph_node() + const { + return *m_graph_node_ptr; + } + + sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable>& + get_sycl_graph() const { + return *m_graph_ptr; + } + + private: + Kokkos::ObservingRawPtr<sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable>> + m_graph_ptr = nullptr; + Kokkos::ObservingRawPtr<std::optional<sycl::ext::oneapi::experimental::node>> + m_graph_node_ptr = nullptr; +}; + +struct SYCLGraphNodeAggregateKernel { + using graph_kernel = SYCLGraphNodeAggregateKernel; + + // Aggregates don't need a policy, but for the purposes of checking the static + // assertions about graph kernels, + struct Policy { + using is_graph_kernel = std::true_type; + }; +}; + +template <typename KernelType, + typename Tag = + typename PatternTagFromImplSpecialization<KernelType>::type> +struct get_graph_node_kernel_type + : type_identity< + GraphNodeKernelImpl<Kokkos::SYCL, typename KernelType::Policy, + typename KernelType::functor_type, Tag>> {}; + +template <typename KernelType> +struct get_graph_node_kernel_type<KernelType, Kokkos::ParallelReduceTag> + : type_identity<GraphNodeKernelImpl< + Kokkos::SYCL, typename KernelType::Policy, + CombinedFunctorReducer<typename KernelType::FunctorType, + typename KernelType::ReducerType>, + 
Kokkos::ParallelReduceTag>> {}; + +template <typename KernelType> +auto& get_sycl_graph_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type<KernelType>::type; + auto const& kernel_as_graph_kernel = + static_cast<graph_node_kernel_t const&>(kernel); + auto& graph = kernel_as_graph_kernel.get_sycl_graph(); + + return graph; +} + +template <typename KernelType> +auto& get_sycl_graph_node_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type<KernelType>::type; + auto const& kernel_as_graph_kernel = + static_cast<graph_node_kernel_t const&>(kernel); + auto& graph_node = kernel_as_graph_kernel.get_sycl_graph_node(); + + return graph_node; +} + +template <typename Kernel, typename Lambda> +void sycl_attach_kernel_to_node(Kernel& kernel, const Lambda& lambda) { + sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable>& graph = + Impl::get_sycl_graph_from_kernel(kernel); + std::optional<sycl::ext::oneapi::experimental::node>& graph_node = + Impl::get_sycl_graph_node_from_kernel(kernel); + KOKKOS_ENSURES(!graph_node); + graph_node = graph.add(lambda); + KOKKOS_ENSURES(graph_node); + // FIXME_SYCL_GRAPH not yet implemented in the compiler + // KOKKOS_ENSURES(graph_node.get_type() == + // sycl::ext::oneapi::experimental::node_type::kernel) +} + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..828f1cacb4ac595f1573fc1575721626a6d6fa66 --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SYCL_GRAPHNODE_IMPL_HPP +#define KOKKOS_SYCL_GRAPHNODE_IMPL_HPP + +#include <Kokkos_Graph_fwd.hpp> + +#include <impl/Kokkos_GraphImpl.hpp> + +#include <SYCL/Kokkos_SYCL.hpp> + +#include <optional> + +namespace Kokkos { +namespace Impl { +template <> +struct GraphNodeBackendSpecificDetails<Kokkos::SYCL> { + std::optional<sycl::ext::oneapi::experimental::node> node; + + explicit GraphNodeBackendSpecificDetails() = default; + + explicit GraphNodeBackendSpecificDetails( + _graph_node_is_root_ctor_tag) noexcept {} +}; + +template <typename Kernel, typename PredecessorRef> +struct GraphNodeBackendDetailsBeforeTypeErasure<Kokkos::SYCL, Kernel, + PredecessorRef> { + protected: + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::SYCL const &, Kernel &, PredecessorRef const &, + GraphNodeBackendSpecificDetails<Kokkos::SYCL> &) noexcept {} + + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::SYCL const &, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails<Kokkos::SYCL> &) noexcept {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..dc63052dd7a7df5a219640fd9cdb43414b5baf00 --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp @@ -0,0 +1,178 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SYCL_GRAPH_IMPL_HPP +#define KOKKOS_SYCL_GRAPH_IMPL_HPP + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Graph_fwd.hpp> + +#include <impl/Kokkos_GraphImpl.hpp> +#include <impl/Kokkos_GraphNodeImpl.hpp> + +#include <SYCL/Kokkos_SYCL_GraphNodeKernel.hpp> + +#include <optional> + +namespace Kokkos { +namespace Impl { +template <> +class GraphImpl<Kokkos::SYCL> { + public: + using node_details_t = GraphNodeBackendSpecificDetails<Kokkos::SYCL>; + using root_node_impl_t = + GraphNodeImpl<Kokkos::SYCL, Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag>; + using aggregate_kernel_impl_t = SYCLGraphNodeAggregateKernel; + using aggregate_node_impl_t = + GraphNodeImpl<Kokkos::SYCL, aggregate_kernel_impl_t, + Kokkos::Experimental::TypeErasedTag>; + + // Not movable or copyable; it spends its whole life as a shared_ptr in the + // Graph object. 
+ GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl const&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; + + ~GraphImpl(); + + explicit GraphImpl(Kokkos::SYCL instance); + + void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr); + + template <class NodeImpl> + void add_node(std::shared_ptr<NodeImpl> const& arg_node_ptr); + + template <class NodeImplPtr, class PredecessorRef> + void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); + + void submit(const Kokkos::SYCL& exec); + + Kokkos::SYCL const& get_execution_space() const noexcept; + + auto create_root_node_ptr(); + + template <class... PredecessorRefs> + auto create_aggregate_ptr(PredecessorRefs&&...); + + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec.has_value()); + m_graph_exec = m_graph.finalize(); + } + + auto& sycl_graph() { return m_graph; } + auto& sycl_graph_exec() { return m_graph_exec; } + + private: + Kokkos::SYCL m_execution_space; + sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::modifiable> + m_graph; + std::optional<sycl::ext::oneapi::experimental::command_graph< + sycl::ext::oneapi::experimental::graph_state::executable>> + m_graph_exec; +}; + +inline GraphImpl<Kokkos::SYCL>::~GraphImpl() { + m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); +} + +inline GraphImpl<Kokkos::SYCL>::GraphImpl(Kokkos::SYCL instance) + : m_execution_space(std::move(instance)), + m_graph(m_execution_space.sycl_queue().get_context(), + m_execution_space.sycl_queue().get_device()) {} + +inline void GraphImpl<Kokkos::SYCL>::add_node( + std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) { + // add an empty node that needs to be set up before finalizing the graph + arg_node_ptr->node_details_t::node = m_graph.add(); +} + +// Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl +// Also 
requires that the kernel has the graph node tag in its policy +template <class NodeImpl> +inline void GraphImpl<Kokkos::SYCL>::add_node( + std::shared_ptr<NodeImpl> const& arg_node_ptr) { + static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); + KOKKOS_EXPECTS(arg_node_ptr); + // The Kernel launch from the execute() method has been shimmed to insert + // the node into the graph + auto& kernel = arg_node_ptr->get_kernel(); + auto& node = static_cast<node_details_t*>(arg_node_ptr.get())->node; + KOKKOS_EXPECTS(!node); + kernel.set_sycl_graph_ptr(&m_graph); + kernel.set_sycl_graph_node_ptr(&node); + kernel.execute(); + KOKKOS_ENSURES(node); +} + +// Requires PredecessorRef is a specialization of GraphNodeRef that has +// already been added to this graph and NodeImpl is a specialization of +// GraphNodeImpl that has already been added to this graph. +template <class NodeImplPtr, class PredecessorRef> +inline void GraphImpl<Kokkos::SYCL>::add_predecessor( + NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { + KOKKOS_EXPECTS(arg_node_ptr); + auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); + KOKKOS_EXPECTS(pred_ptr); + + auto& pred_node = pred_ptr->node_details_t::node; + KOKKOS_EXPECTS(pred_node); + + auto& node = arg_node_ptr->node_details_t::node; + KOKKOS_EXPECTS(node); + + m_graph.make_edge(*pred_node, *node); +} + +inline void GraphImpl<Kokkos::SYCL>::submit(const Kokkos::SYCL& exec) { + if (!m_graph_exec) { + instantiate(); + } + exec.sycl_queue().ext_oneapi_graph(*m_graph_exec); +} + +inline Kokkos::SYCL const& GraphImpl<Kokkos::SYCL>::get_execution_space() + const noexcept { + return m_execution_space; +} + +inline auto GraphImpl<Kokkos::SYCL>::create_root_node_ptr() { + KOKKOS_EXPECTS(!m_graph_exec); + auto rv = std::make_shared<root_node_impl_t>(get_execution_space(), + _graph_node_is_root_ctor_tag{}); + rv->node_details_t::node = m_graph.add(); + return rv; +} + +template <class... 
PredecessorRefs> +inline auto GraphImpl<Kokkos::SYCL>::create_aggregate_ptr( + PredecessorRefs&&...) { + // The attachment to predecessors, which is all we really need, happens + // in the generic layer, which calls through to add_predecessor for + // each predecessor ref, so all we need to do here is create the (trivial) + // aggregate node. + return std::make_shared<aggregate_node_impl_t>(m_execution_space, + _graph_node_kernel_ctor_tag{}, + aggregate_kernel_impl_t{}); +} +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 080369770d75d9b755e5fa34bab00b7e52aae755..5af1330d9390f45ed2265bf68c60516ae81ac32d 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -24,14 +24,12 @@ #include <impl/Kokkos_Error.hpp> namespace Kokkos { -namespace Experimental { namespace Impl { namespace { // FIXME_SYCL Should be a multiple of the maximum subgroup size. 
-static constexpr auto sizeScratchGrain = - sizeof(Kokkos::Experimental::SYCL::size_type[32]); +static constexpr auto sizeScratchGrain = sizeof(Kokkos::SYCL::size_type[32]); std::size_t scratch_count(const std::size_t size) { return (size + sizeScratchGrain - 1) / sizeScratchGrain; @@ -54,9 +52,9 @@ Kokkos::View<uint32_t*, SYCLDeviceUSMSpace> sycl_global_unique_token_locks( } SYCLInternal::~SYCLInternal() { - if (!was_finalized || m_scratchSpace || m_scratchFlags) { - std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " - "Kokkos::Experimental::SYCL::finalize()" + if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { + std::cerr << "Kokkos::SYCL ERROR: Failed to call " + "Kokkos::SYCL::finalize()" << std::endl; std::cerr.flush(); } @@ -64,7 +62,7 @@ SYCLInternal::~SYCLInternal() { int SYCLInternal::verify_is_initialized(const char* const label) const { if (!is_initialized()) { - Kokkos::abort((std::string("Kokkos::Experimental::SYCL::") + label + + Kokkos::abort((std::string("Kokkos::SYCL::") + label + " : ERROR device not initialized\n") .c_str()); } @@ -102,6 +100,23 @@ void SYCLInternal::initialize(const sycl::device& d) { void SYCLInternal::initialize(const sycl::queue& q) { KOKKOS_EXPECTS(!is_initialized()); +#define KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(BACKEND, REQUIRED) \ + if (BACKEND != REQUIRED) \ + Kokkos::abort( \ + "The SYCL execution space instance was initialized with an " \ + "unsupported backend type! 
For this GPU architecture, only " #REQUIRED \ + " is supported.") +#if defined(KOKKOS_ARCH_INTEL_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_level_zero); +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_cuda); +#elif defined(KOKKOS_ARCH_AMD_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_hip); +#endif + if (was_finalized) Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n"); @@ -149,26 +164,27 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -sycl::device_ptr<void> SYCLInternal::resize_team_scratch_space( +Kokkos::Impl::sycl_device_ptr<void> SYCLInternal::resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
- if (m_team_scratch_current_size[scratch_pool_id] == 0) { + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); + if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc<Experimental::SYCLDeviceUSMSpace>( - "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc<Experimental::SYCLDeviceUSMSpace>( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -196,21 +212,29 @@ void SYCLInternal::finalize() { #endif } - using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>; + auto device_mem_space = SYCLDeviceUSMSpace(*m_queue); + auto host_mem_space = SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + if (nullptr != m_scratchHost) + host_mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); if (nullptr != m_scratchFlags) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); m_syclDev = -1; m_scratchSpaceCount = 0; 
m_scratchSpace = nullptr; + m_scratchHostCount = 0; + m_scratchHost = nullptr; m_scratchFlagsCount = 0; m_scratchFlags = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) { - Kokkos::kokkos_free<Kokkos::Experimental::SYCLDeviceUSMSpace>( - m_team_scratch_ptr[i]); + device_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; } @@ -225,57 +249,74 @@ void SYCLInternal::finalize() { m_queue.reset(); } -sycl::device_ptr<void> SYCLInternal::scratch_space(const std::size_t size) { +Kokkos::Impl::sycl_device_ptr<void> SYCLInternal::scratch_space( + const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - Record::decrement(Record::get_record(m_scratchSpace)); + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size); + m_scratchSpace = static_cast<size_type*>( + mem_space.allocate("Kokkos::SYCL::InternalScratchSpace", alloc_size)); + } + + return m_scratchSpace; +} + +Kokkos::Impl::sycl_host_ptr<void> SYCLInternal::scratch_host( + const std::size_t size) { + if (verify_is_initialized("scratch_unified") && + m_scratchHostCount < scratch_count(size)) { + auto mem_space = Kokkos::SYCLHostUSMSpace(*m_queue); - Record::increment(r); + if (nullptr != m_scratchHost) + mem_space.deallocate(m_scratchHost, + 
m_scratchHostCount * sizeScratchGrain); - m_scratchSpace = reinterpret_cast<size_type*>(r->data()); + m_scratchHostCount = scratch_count(size); + + std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( + m_scratchHostCount, sizeScratchGrain); + m_scratchHost = static_cast<size_type*>( + mem_space.allocate("Kokkos::SYCL::InternalScratchHost", alloc_size)); } - return m_scratchSpace; + return m_scratchHost; } -sycl::device_ptr<void> SYCLInternal::scratch_flags(const std::size_t size) { +Kokkos::Impl::sycl_device_ptr<void> SYCLInternal::scratch_flags( + const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) - Record::decrement(Record::get_record(m_scratchFlags)); + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast<size_type*>(r->data()); - } - auto memset_event = m_queue->memset(m_scratchFlags, 0, - m_scratchFlagsCount * sizeScratchGrain); + m_scratchFlags = static_cast<size_type*>( + mem_space.allocate("Kokkos::SYCL::InternalScratchFlags", alloc_size)); + + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. 
+ auto memset_event = m_queue->memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); + m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); #endif + } return m_scratchFlags; } @@ -283,8 +324,7 @@ sycl::device_ptr<void> SYCLInternal::scratch_flags(const std::size_t size) { template <typename WAT> void SYCLInternal::fence_helper(WAT& wat, const std::string& name, uint32_t instance_id) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::SYCL>( name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id}, [&]() { try { @@ -318,15 +358,11 @@ size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) { assert(m_q); if (m_capacity < n) { - using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>; - // First free what we have (in case malloc can reuse it) - if (m_data) Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + if (m_data) alloc_space.deallocate(m_data, m_capacity); - Record* const r = Record::allocate( - AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n); - Record::increment(r); + m_data = alloc_space.allocate("Kokkos::SYCL::USMObjectMem", n); - m_data = r->data(); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); m_capacity = n; @@ -340,8 +376,8 @@ void SYCLInternal::USMObjectMem<Kind>::reset() { if (m_data) { // This implies a fence since this class is not copyable // and deallocating implies a fence across all registered queues. 
- using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>; - Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + alloc_space.deallocate(m_data, m_capacity); m_capacity = 0; m_data = nullptr; @@ -356,5 +392,4 @@ template class SYCLInternal::USMObjectMem<sycl::usm::alloc::device>; template class SYCLInternal::USMObjectMem<sycl::usm::alloc::host>; } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 51a617054d6db8ca15dce2bf7d7d236f33dc2425..c982154a9a825a8ae175de9bc305709aad628700 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -28,7 +28,6 @@ #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Profiling.hpp> namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal { @@ -38,17 +37,17 @@ class SYCLInternal { SYCLInternal() = default; ~SYCLInternal(); - SYCLInternal(const SYCLInternal&) = delete; + SYCLInternal(const SYCLInternal&) = delete; SYCLInternal& operator=(const SYCLInternal&) = delete; - SYCLInternal& operator=(SYCLInternal&&) = delete; - SYCLInternal(SYCLInternal&&) = delete; + SYCLInternal& operator=(SYCLInternal&&) = delete; + SYCLInternal(SYCLInternal&&) = delete; - sycl::device_ptr<void> scratch_space(const std::size_t size); - sycl::device_ptr<void> scratch_flags(const std::size_t size); + Kokkos::Impl::sycl_device_ptr<void> scratch_space(const std::size_t size); + Kokkos::Impl::sycl_device_ptr<void> scratch_flags(const std::size_t size); + Kokkos::Impl::sycl_host_ptr<void> scratch_host(const std::size_t size); int acquire_team_scratch_space(); - sycl::device_ptr<void> resize_team_scratch_space(int scratch_pool_id, - std::int64_t bytes, - bool force_shrink = false); + Kokkos::Impl::sycl_device_ptr<void> resize_team_scratch_space( + int 
scratch_pool_id, std::int64_t bytes, bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); uint32_t impl_get_instance_id() const; @@ -58,23 +57,27 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - sycl::device_ptr<size_type> m_scratchSpace = nullptr; - std::size_t m_scratchFlagsCount = 0; - sycl::device_ptr<size_type> m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + Kokkos::Impl::sycl_device_ptr<size_type> m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + Kokkos::Impl::sycl_host_ptr<size_type> m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + Kokkos::Impl::sycl_device_ptr<size_type> m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space - static constexpr int m_n_team_scratch = 10; - mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable sycl::device_ptr<void> m_team_scratch_ptr[m_n_team_scratch] = {}; - mutable int m_current_team_scratch = 0; - mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; + static constexpr int m_n_team_scratch = 10; + mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; + mutable Kokkos::Impl::sycl_device_ptr<void> + m_team_scratch_ptr[m_n_team_scratch] = {}; + mutable int m_current_team_scratch = 0; + mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; - uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< - Kokkos::Experimental::SYCL>(reinterpret_cast<uintptr_t>(this)); + uint32_t m_instance_id = + Kokkos::Tools::Experimental::Impl::idForInstance<Kokkos::SYCL>( + reinterpret_cast<uintptr_t>(this)); std::optional<sycl::queue> m_queue; // Using std::vector<std::optional<sycl::queue>> reveals a compiler bug when @@ -99,9 +102,9 @@ class SYCLInternal { explicit 
USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept : m_q(std::move(q)), m_instance_id(instance_id) {} - USMObjectMem(USMObjectMem const&) = delete; - USMObjectMem(USMObjectMem&&) = delete; - USMObjectMem& operator=(USMObjectMem&&) = delete; + USMObjectMem(USMObjectMem const&) = delete; + USMObjectMem(USMObjectMem&&) = delete; + USMObjectMem& operator=(USMObjectMem&&) = delete; USMObjectMem& operator=(USMObjectMem const&) = delete; ~USMObjectMem() { reset(); }; @@ -116,12 +119,12 @@ class SYCLInternal { size_t reserve(size_t n); private: - using AllocationSpace = std::conditional_t< - Kind == sycl::usm::alloc::device, - Kokkos::Experimental::SYCLDeviceUSMSpace, - std::conditional_t<Kind == sycl::usm::alloc::shared, - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>>; + using AllocationSpace = + std::conditional_t<Kind == sycl::usm::alloc::device, + Kokkos::SYCLDeviceUSMSpace, + std::conditional_t<Kind == sycl::usm::alloc::shared, + Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLHostUSMSpace>>; public: // Performs either sycl::memcpy (for USM device memory) or std::memcpy @@ -141,11 +144,10 @@ class SYCLInternal { } void fence() { - SYCLInternal::fence( - m_last_event, - "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for " - "last event to finish", - m_instance_id); + SYCLInternal::fence(m_last_event, + "Kokkos::SYCLInternal::USMObject fence to wait for " + "last event to finish", + m_instance_id); } void register_event(sycl::event event) { @@ -321,17 +323,16 @@ auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) { return SYCLFunctionWrapper<Functor, Storage>(functor, storage); } } // namespace Impl -} // namespace Experimental } // namespace Kokkos #if defined(SYCL_DEVICE_COPYABLE) && defined(KOKKOS_ARCH_INTEL_GPU) template <typename Functor, typename Storage> struct sycl::is_device_copyable< - Kokkos::Experimental::Impl::SYCLFunctionWrapper<Functor, Storage, false>> + 
Kokkos::Impl::SYCLFunctionWrapper<Functor, Storage, false>> : std::true_type {}; -// FIXME_SYCL Remove when this specialization when specializations for -// sycl::device_copyable also apply to const-qualified types. +#if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ + (defined(__LIBSYCL_MAJOR_VERSION) && __LIBSYCL_MAJOR_VERSION < 7) template <typename> struct NonTriviallyCopyableAndDeviceCopyable { NonTriviallyCopyableAndDeviceCopyable( @@ -349,10 +350,10 @@ static_assert( template <typename Functor, typename Storage> struct sycl::is_device_copyable< - const Kokkos::Experimental::Impl::SYCLFunctionWrapper<Functor, Storage, - false>, + const Kokkos::Impl::SYCLFunctionWrapper<Functor, Storage, false>, std::enable_if_t<!sycl::is_device_copyable_v< const NonTriviallyCopyableAndDeviceCopyable<Functor>>>> : std::true_type {}; #endif #endif +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp index d212e2dacc3aa298d01b1c0a8d75e4897b19b419..9498513a3e8ca559b24cb41ad3cca3f6ac767554 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp @@ -22,13 +22,13 @@ namespace Kokkos { template <> -struct default_outer_direction<Kokkos::Experimental::SYCL> { +struct default_outer_direction<Kokkos::SYCL> { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; template <> -struct default_inner_direction<Kokkos::Experimental::SYCL> { +struct default_inner_direction<Kokkos::SYCL> { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; @@ -37,8 +37,8 @@ namespace Impl { // Settings for MDRangePolicy template <> -inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::SYCL>( - const Kokkos::Experimental::SYCL& space) { +inline TileSizeProperties get_tile_size_properties<Kokkos::SYCL>( + const Kokkos::SYCL& space) { TileSizeProperties 
properties; properties.max_threads = space.impl_internal_space_instance()->m_maxWorkgroupSize; @@ -50,8 +50,7 @@ inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::SYCL>( // Settings for TeamMDRangePolicy template <typename Rank, TeamMDRangeThreadAndVector ThreadAndVector> -struct ThreadAndVectorNestLevel<Rank, Kokkos::Experimental::SYCL, - ThreadAndVector> +struct ThreadAndVectorNestLevel<Rank, Kokkos::SYCL, ThreadAndVector> : AcceleratorBasedNestLevel<Rank, ThreadAndVector> {}; } // namespace Impl diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index f4fada570b0e6839f0ba20de9d50b4db274d724d..3dbd63d81ad5e956583777aa4f748854905502d9 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -25,7 +25,7 @@ template <class FunctorType, class... Traits> class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy<Traits...>; @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, const typename Policy::index_type m_num_tiles; static constexpr Iterate inner_direction = Policy::inner_direction; } m_policy; - const Kokkos::Experimental::SYCL& m_space; + const Kokkos::SYCL& m_space; sycl::nd_range<3> compute_ranges() const { const auto& m_tile = m_policy.m_tile; @@ -118,7 +118,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, const BarePolicy bare_policy(m_policy); - auto parallel_for_event = q.submit([&](sycl::handler& cgh) { + desul::ensure_sycl_lock_arrays_on_device(q); + + auto cgh_lambda = [&](sycl::handler& cgh) { const auto range = compute_ranges(); const sycl::range<3> global_range = range.get_global_range(); const sycl::range<3> local_range = 
range.get_local_range(); @@ -151,12 +153,22 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, {global_x, global_y, global_z}, {local_x, local_y, local_z}) .exec_range(); }); - }); -#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event}); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + return {}; + } else #endif + { + auto parallel_for_event = q.submit(cgh_lambda); - return parallel_for_event; +#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES + q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event}); +#endif + return parallel_for_event; + } } public: @@ -168,23 +180,16 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, } void execute() const { - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = - m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy), diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 
9c5767d209ff19d8798cec1f8f93bbdbf0a820d4..da75f3e901d12ecc2c8457f90f751b3f5cd7c57e 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -17,11 +17,15 @@ #ifndef KOKKOS_SYCL_PARALLEL_FOR_RANGE_HPP_ #define KOKKOS_SYCL_PARALLEL_FOR_RANGE_HPP_ +#ifdef SYCL_EXT_ONEAPI_AUTO_LOCAL_RANGE +#include <Kokkos_BitManipulation.hpp> +#endif #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES #include <vector> #endif namespace Kokkos::Impl { +#ifndef SYCL_EXT_ONEAPI_AUTO_LOCAL_RANGE template <typename FunctorWrapper, typename Policy> struct FunctorWrapperRangePolicyParallelFor { using WorkTag = typename Policy::work_tag; @@ -37,14 +41,15 @@ struct FunctorWrapperRangePolicyParallelFor { typename Policy::index_type m_begin; FunctorWrapper m_functor_wrapper; }; +#endif // Same as above but for a user-provided workgroup size template <typename FunctorWrapper, typename Policy> struct FunctorWrapperRangePolicyParallelForCustom { using WorkTag = typename Policy::work_tag; - void operator()(sycl::item<1> item) const { - const typename Policy::index_type id = item.get_linear_id(); + void operator()(sycl::nd_item<1> item) const { + const typename Policy::index_type id = item.get_global_linear_id(); if (id < m_work_size) { const auto shifted_id = id + m_begin; if constexpr (std::is_void_v<WorkTag>) @@ -62,7 +67,7 @@ struct FunctorWrapperRangePolicyParallelForCustom { template <class FunctorType, class... 
Traits> class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy<Traits...>; @@ -74,25 +79,47 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, const Policy m_policy; template <typename Functor> - static sycl::event sycl_direct_launch(const Policy& policy, - const Functor& functor, - const sycl::event& memcpy_event) { + sycl::event sycl_direct_launch(const Policy& policy, const Functor& functor, + const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = policy.space(); + sycl::queue& q = space.sycl_queue(); - auto parallel_for_event = q.submit([&](sycl::handler& cgh) { + desul::ensure_sycl_lock_arrays_on_device(q); + + auto cgh_lambda = [&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); #else (void)memcpy_event; #endif + if (policy.chunk_size() <= 1) { +#ifdef SYCL_EXT_ONEAPI_AUTO_LOCAL_RANGE + const auto actual_range = policy.end() - policy.begin(); + FunctorWrapperRangePolicyParallelForCustom<Functor, Policy> f{ + policy.begin(), functor, actual_range}; + // Round the actual range up to the closest power of two not exceeding + // the maximum workgroup size + const auto max_wgroup_size = + q.get_device().get_info<sycl::info::device::max_work_group_size>(); + const auto wgroup_size_multiple = Kokkos::bit_floor( + std::min<std::size_t>(max_wgroup_size, actual_range)); + + const auto launch_range = (actual_range + wgroup_size_multiple - 1) / + wgroup_size_multiple * wgroup_size_multiple; + sycl::nd_range<1> range( + launch_range, sycl::ext::oneapi::experimental::auto_range<1>()); + cgh.parallel_for< + FunctorWrapperRangePolicyParallelForCustom<Functor, Policy>>(range, + f); +#else FunctorWrapperRangePolicyParallelFor<Functor, Policy> 
f{policy.begin(), functor}; sycl::range<1> range(policy.end() - policy.begin()); cgh.parallel_for<FunctorWrapperRangePolicyParallelFor<Functor, Policy>>( range, f); +#endif } else { // Use the chunk size as workgroup size. We need to make sure that the // range the kernel is launched with is a multiple of the workgroup @@ -109,12 +136,22 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, FunctorWrapperRangePolicyParallelForCustom<Functor, Policy>>(range, f); } - }); -#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event}); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + return {}; + } else #endif + { + auto parallel_for_event = q.submit(cgh_lambda); - return parallel_for_event; +#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES + q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event}); +#endif + return parallel_for_event; + } } public: @@ -123,24 +160,18 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, void execute() const { if (m_policy.begin() == m_policy.end()) return; - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = m_policy.space() - .impl_internal_space_instance() - ->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_policy.space() + .impl_internal_space_instance() + ->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); } - ParallelFor(const ParallelFor&) = delete; - ParallelFor(ParallelFor&&) = delete; - ParallelFor& operator=(const 
ParallelFor&) = delete; - ParallelFor& operator=(ParallelFor&&) = delete; - ~ParallelFor() = default; - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} }; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 4fc5818ce9bdcdd5092473eb5339a428ee4754c8..d8859cda9f3e064654db4c6c0832f561323caca8 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -22,15 +22,16 @@ #include <SYCL/Kokkos_SYCL_Team.hpp> #include <SYCL/Kokkos_SYCL_TeamPolicy.hpp> +#include <sstream> #include <vector> template <typename FunctorType, typename... Properties> class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: - using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>; + using Policy = TeamPolicy<Properties...>; using functor_type = FunctorType; - using size_type = ::Kokkos::Experimental::SYCL::size_type; + using size_type = ::Kokkos::SYCL::size_type; private: using member_type = typename Policy::member_type; @@ -44,22 +45,19 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, size_type const m_vector_size; int m_shmem_begin; int m_shmem_size; - sycl::device_ptr<char> m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor/Reduce modify the team scratch memory. The - // constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock<std::mutex> m_scratch_lock; - int m_scratch_pool_id = -1; template <typename FunctorWrapper> - sycl::event sycl_direct_launch(const Policy& policy, + sycl::event sycl_direct_launch(const sycl_device_ptr<char> global_scratch_ptr, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = m_policy.space(); + sycl::queue& q = space.sycl_queue(); - auto parallel_for_event = q.submit([&](sycl::handler& cgh) { + desul::ensure_sycl_lock_arrays_on_device(q); + + auto cgh_lambda = [&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues sycl::local_accessor<char, 1> team_scratch_memory_L0( @@ -70,11 +68,11 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr; auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), shmem_begin, + scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item, item.get_group_linear_id(), item.get_group_range(1)); @@ -111,28 +109,53 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, sycl::range<2>(m_team_size, m_league_size * final_vector_size), sycl::range<2>(m_team_size, final_vector_size)), lambda); - }); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + return {}; + } else +#endif + { + auto parallel_for_event = 
q.submit(cgh_lambda); + #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event}); + q.ext_oneapi_submit_barrier(std::vector<sycl::event>{parallel_for_event}); #endif - return parallel_for_event; + return parallel_for_event; + } } public: inline void execute() const { if (m_league_size == 0) return; - auto& space = *m_policy.space().impl_internal_space_instance(); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = space.get_indirect_kernel_mem(); + auto& instance = *m_policy.space().impl_internal_space_instance(); + + // Only let one instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock<std::mutex> team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr<char> global_scratch_ptr = + static_cast<sycl_device_ptr<char>>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size)); + + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); - sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, + sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); - space.register_team_scratch_event(m_scratch_pool_id, event); + instance.register_team_scratch_event(scratch_pool_id, event); } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -140,14 +163,15 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, 
m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { - // FIXME_SYCL optimize - if (m_team_size < 0) + m_vector_size(arg_policy.impl_vector_length()) { + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended(arg_functor, ParallelForTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor<SYCL, TeamPolicy> could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -156,22 +180,14 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast<sycl::device_ptr<char>>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size)); - - if (static_cast<int>(space.m_maxShmemPerBlock) < + const auto& instance = *m_policy.space().impl_internal_space_instance(); + if (static_cast<int>(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! 
" "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 6964c2dbcf0daf087275969f90b5a8037b78cf49..1e313549757bf037ad2fbd75658c1c7b2b2d15f7 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -30,7 +30,7 @@ template <class CombinedFunctorReducerType, class... Traits> class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, Kokkos::MDRangePolicy<Traits...>, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy<Traits...>; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -76,10 +76,8 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, m_space(p.space()), m_result_ptr(v.data()), m_result_ptr_device_accessible( - MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, - typename View::memory_space>::accessible), - m_shared_memory_lock( - m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} + MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + typename View::memory_space>::accessible) {} private: template <typename CombinedFunctorReducerWrapper> @@ -87,30 +85,39 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references - Kokkos::Experimental::Impl::SYCLInternal& instance = + Kokkos::Impl::SYCLInternal& instance = *m_space.impl_internal_space_instance(); sycl::queue& q = m_space.sycl_queue(); const typename Policy::index_type n_tiles = m_policy.m_num_tiles; const unsigned int value_count = 
m_functor_reducer.get_reducer().value_count(); - sycl::device_ptr<value_type> results_ptr; + sycl_device_ptr<value_type> results_ptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast<sycl_host_ptr<value_type>>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If n_tiles==0 we only call init() and final() working with the global // scratch memory but don't copy back to m_result_ptr yet. if (n_tiles == 0) { - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); #else (void)memcpy_event; #endif - results_ptr = static_cast<sycl::device_ptr<value_type>>( + results_ptr = static_cast<sycl_device_ptr<value_type>>( instance.scratch_space(sizeof(value_type) * value_count)); - sycl::global_ptr<value_type> device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast<sycl::global_ptr<value_type>>(m_result_ptr) + : static_cast<sycl::global_ptr<value_type>>(host_result_ptr); cgh.single_task([=]() { const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -120,12 +127,20 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, if (device_accessible_result_ptr) reducer.copy(device_accessible_result_ptr.get(), results_ptr.get()); }); - }); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + } else +#endif + { + last_reduction_event = q.submit(cgh_lambda); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier( - std::vector<sycl::event>{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector<sycl::event>{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } else { // Otherwise (when n_tiles is not zero), we perform a reduction on the // values in all workgroups separately, write the workgroup results back @@ -146,14 +161,16 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, n_wgroups = (n_tiles + values_per_thread - 1) / values_per_thread; } - results_ptr = static_cast<sycl::device_ptr<value_type>>( + results_ptr = static_cast<sycl_device_ptr<value_type>>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); - sycl::global_ptr<value_type> device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; - auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>( + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast<sycl::global_ptr<value_type>>(m_result_ptr) + : static_cast<sycl::global_ptr<value_type>>(host_result_ptr); + auto scratch_flags = static_cast<sycl_device_ptr<unsigned int>>( instance.scratch_flags(sizeof(unsigned int))); - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { sycl::local_accessor<value_type> local_mem( sycl::range<1>(wgroup_size) * value_count, cgh); sycl::local_accessor<unsigned int> num_teams_done(1, cgh); @@ -223,6 +240,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } item.barrier(sycl::access::fence_space::local_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= static_cast<int>(n_wgroups)) reducer.init(&local_mem[local_id * value_count]); else { @@ -268,6 +286,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } item.barrier(sycl::access::fence_space::local_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= static_cast<int>(n_wgroups)) reducer.init(&local_value); else { @@ -285,22 +304,36 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } } }); - }); + }; +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + } else +#endif + { + last_reduction_event = q.submit(cgh_lambda); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier( - std::vector<sycl::event>{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector<sycl::event>{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } // At this point, the reduced value is written to the entry in results_ptr // and all that is left is to copy it back to the given result pointer if // necessary. 
- if (m_result_ptr && !m_result_ptr_device_accessible) { - Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>( - m_space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + if constexpr (Policy::is_graph_kernel::value) + Kokkos::abort( + "parallel_reduce not implemented for graph kernels if result is " + "not device-accessible!"); + + m_space.fence( + "Kokkos::Impl::ParallelReduce<SYCL, MDRangePolicy>::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(value_type) * value_count); } return last_reduction_event; @@ -313,15 +346,19 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } void execute() const { - Kokkos::Experimental::Impl::SYCLInternal& instance = + Kokkos::Impl::SYCLInternal& instance = *m_space.impl_internal_space_instance(); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + + // Only let one instance at a time resize the instance's scratch memory + // allocations. 
+ std::scoped_lock<std::mutex> scratch_buffers_lock( + instance.m_mutexScratchSpace); + + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch( functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event()); @@ -331,13 +368,9 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, private: const CombinedFunctorReducerType m_functor_reducer; const BarePolicy m_policy; - const Kokkos::Experimental::SYCL& m_space; + const Kokkos::SYCL& m_space; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - - // Only let one Parallel/Scan modify the shared memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock<std::mutex> m_shared_memory_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 8c900cfa42806716d857c19b7beb49fa70c83d5a..77a4bfb12ff1e3e0de18cf90de53d5439ca0348b 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -25,9 +25,8 @@ #include <vector> template <class CombinedFunctorReducerType, class... 
Traits> -class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, - Kokkos::RangePolicy<Traits...>, - Kokkos::Experimental::SYCL> { +class Kokkos::Impl::ParallelReduce< + CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy<Traits...>; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -49,10 +48,8 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, m_policy(p), m_result_ptr(v.data()), m_result_ptr_device_accessible( - MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, - typename View::memory_space>::accessible), - m_shared_memory_lock( - p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} + MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + typename View::memory_space>::accessible) {} private: template <typename PolicyType, typename CombinedFunctorReducerWrapper> @@ -61,28 +58,37 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); - Kokkos::Experimental::Impl::SYCLInternal& instance = + const Kokkos::SYCL& space = policy.space(); + Kokkos::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); std::size_t size = policy.end() - policy.begin(); const unsigned int value_count = m_functor_reducer.get_reducer().value_count(); - sycl::device_ptr<value_type> results_ptr = nullptr; - sycl::global_ptr<value_type> device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + sycl_device_ptr<value_type> results_ptr = nullptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? 
static_cast<sycl_host_ptr<value_type>>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast<sycl::global_ptr<value_type>>(m_result_ptr) + : static_cast<sycl::global_ptr<value_type>>(host_result_ptr); sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. if (size <= 1) { - results_ptr = static_cast<sycl::device_ptr<value_type>>( + results_ptr = static_cast<sycl_device_ptr<value_type>>( instance.scratch_space(sizeof(value_type) * value_count)); - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { const auto begin = policy.begin(); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -105,24 +111,32 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, if (device_accessible_result_ptr != nullptr) reducer.copy(device_accessible_result_ptr.get(), results_ptr.get()); }); - }); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + } else +#endif + { + last_reduction_event = q.submit(cgh_lambda); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier( - std::vector<sycl::event>{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector<sycl::event>{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } else { // Otherwise (when size > 1), we perform a reduction on the values in all // workgroups separately, write the workgroup results back to global // memory and recurse until only one workgroup does the reduction and thus // gets the final value. 
- auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>( + auto scratch_flags = static_cast<sycl_device_ptr<unsigned int>>( instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor<value_type> local_mem, sycl::local_accessor<unsigned int> num_teams_done, - sycl::device_ptr<value_type> results_ptr, int values_per_thread) { + sycl_device_ptr<value_type> results_ptr, int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -168,6 +182,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } item.barrier(sycl::access::fence_space::local_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -210,6 +225,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } item.barrier(sycl::access::fence_space::local_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_value); else { @@ -230,7 +246,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, return lambda; }; - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { sycl::local_accessor<unsigned int> num_teams_done(1, cgh); auto dummy_reduction_lambda = @@ -291,7 +307,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, } results_ptr = - static_cast<sycl::device_ptr<value_type>>(instance.scratch_space( + static_cast<sycl_device_ptr<value_type>>(instance.scratch_space( sizeof(value_type) * value_count * n_wgroups)); sycl::local_accessor<value_type> local_mem( @@ -309,22 +325,37 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), reduction_lambda); - }); + }; + +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr 
(Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + } else +#endif + { + last_reduction_event = q.submit(cgh_lambda); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier( - std::vector<sycl::event>{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + std::vector<sycl::event>{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } // At this point, the reduced value is written to the entry in results_ptr // and all that is left is to copy it back to the given result pointer if // necessary. - if (m_result_ptr && !m_result_ptr_device_accessible) { - Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + if constexpr (Policy::is_graph_kernel::value) + Kokkos::abort( + "parallel_reduce not implemented for graph kernels if result is " + "not device-accessible!"); + + space.fence( + "Kokkos::Impl::ParallelReduce<SYCL, RangePolicy>::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -332,15 +363,19 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, public: void execute() const { - Kokkos::Experimental::Impl::SYCLInternal& instance = + Kokkos::Impl::SYCLInternal& instance = *m_policy.space().impl_internal_space_instance(); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + + // Only let one instance at a time resize the instance's scratch memory + // allocations. 
+ std::scoped_lock<std::mutex> scratch_buffers_lock( + instance.m_mutexScratchSpace); + + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_reducer_wrapper, @@ -353,10 +388,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, const Policy m_policy; const pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - - // Only let one Parallel/Scan modify the shared memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock<std::mutex> m_shared_memory_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 07145b0fb93c4c76ad91129b3e96d24d8ed7833b..8f5310cbb21c2597faff4b53c20a12f75a733f85 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -23,14 +23,15 @@ #include <SYCL/Kokkos_SYCL_TeamPolicy.hpp> #include <SYCL/Kokkos_SYCL_WorkgroupReduction.hpp> +#include <sstream> #include <vector> template <class CombinedFunctorReducerType, class... 
Properties> class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, Kokkos::TeamPolicy<Properties...>, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: - using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>; + using Policy = TeamPolicy<Properties...>; using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; @@ -45,7 +46,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, public: using functor_type = FunctorType; - using size_type = Kokkos::Experimental::SYCL::size_type; + using size_type = Kokkos::SYCL::size_type; private: const CombinedFunctorReducerType m_functor_reducer; @@ -54,24 +55,19 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, const bool m_result_ptr_device_accessible; size_type m_shmem_begin; size_type m_shmem_size; - sycl::device_ptr<char> m_global_scratch_ptr; size_t m_scratch_size[2]; const size_type m_league_size; int m_team_size; const size_type m_vector_size; - // Only let one ParallelFor/Reduce modify the team scratch memory. The - // constructor acquires the mutex which is released in the destructor. 
- std::scoped_lock<std::mutex> m_scratch_lock; - int m_scratch_pool_id = -1; - template <typename PolicyType, typename CombinedFunctorReducerWrapper> + template <typename CombinedFunctorReducerWrapper> sycl::event sycl_direct_launch( - const PolicyType& policy, + const sycl_device_ptr<char> global_scratch_ptr, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); - Kokkos::Experimental::Impl::SYCLInternal& instance = + const Kokkos::SYCL& space = m_policy.space(); + Kokkos::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -79,20 +75,29 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, m_functor_reducer.get_reducer().value_count(); std::size_t size = std::size_t(m_league_size) * m_team_size * m_vector_size; value_type* results_ptr = nullptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast<sycl_host_ptr<value_type>>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. if (size <= 1) { results_ptr = - static_cast<sycl::device_ptr<value_type>>(instance.scratch_space( + static_cast<sycl_device_ptr<value_type>>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); - sycl::global_ptr<value_type> device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast<sycl::global_ptr<value_type>>(m_result_ptr) + : static_cast<sycl::global_ptr<value_type>>(host_result_ptr); - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { + auto cgh_lambda = [&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues sycl::local_accessor<char, 1> team_scratch_memory_L0( @@ -103,7 +108,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr; #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); @@ -121,9 +125,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, reference_type update = reducer.init(results_ptr); if (size == 1) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, - scratch_size[0], global_scratch_ptr, scratch_size[1], item, - item.get_group_linear_id(), item.get_group_range(1)); + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), + shmem_begin, scratch_size[0], global_scratch_ptr, + scratch_size[1], item, item.get_group_linear_id(), + item.get_group_range(1)); if constexpr (std::is_void_v<WorkTag>) functor(team_member, update); else @@ -133,19 +138,26 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, if (device_accessible_result_ptr) reducer.copy(device_accessible_result_ptr, &results_ptr[0]); }); - }); + }; +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + } else +#endif + { + last_reduction_event = q.submit(cgh_lambda); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier( - std::vector<sycl::event>{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + 
std::vector<sycl::event>{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } else { // Otherwise, (if the total range has more than one element) we perform a // reduction on the values in all workgroups separately, write the // workgroup results back to global memory and recurse until only one // workgroup does the reduction and thus gets the final value. - auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { - auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>( + auto cgh_lambda = [&](sycl::handler& cgh) { + auto scratch_flags = static_cast<sycl_device_ptr<unsigned int>>( instance.scratch_flags(sizeof(unsigned int))); // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -159,13 +171,16 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, const auto shmem_begin = m_shmem_begin; const auto league_size = m_league_size; const size_t scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; - sycl::device_ptr<char> const global_scratch_ptr = m_global_scratch_ptr; + sycl::local_accessor<unsigned int> num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor<value_type, 1> local_mem, - sycl::device_ptr<value_type> results_ptr) { - sycl::global_ptr<value_type> device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + sycl_device_ptr<value_type> results_ptr) { + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast<sycl::global_ptr<value_type>>(m_result_ptr) + : static_cast<sycl::global_ptr<value_type>>( + host_result_ptr); auto lambda = [=](sycl::nd_item<2> item) { auto n_wgroups = item.get_group_range()[1]; int wgroup_size = @@ -173,8 +188,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, auto group_id = item.get_group_linear_id(); auto size = n_wgroups * wgroup_size; - auto& num_teams_done = reinterpret_cast<unsigned int&>( - local_mem[wgroup_size * std::max(value_count, 1u)]); const auto local_id = item.get_local_linear_id(); const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -188,8 +201,8 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, for (int league_rank = group_id; league_rank < league_size; league_rank += n_wgroups) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, - scratch_size[0], + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), + shmem_begin, scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item, league_rank, league_size); @@ -212,10 +225,11 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, sycl::memory_scope::device, sycl::access::address_space::global_space> scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } sycl::group_barrier(item.get_group()); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -241,8 +255,8 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, for (int league_rank = group_id; league_rank < league_size; league_rank += n_wgroups) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, - scratch_size[0], + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), + shmem_begin, scratch_size[0], 
global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item, league_rank, league_size); @@ -264,10 +278,11 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, sycl::memory_scope::device, sycl::access::address_space::global_space> scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } item.barrier(sycl::access::fence_space::local_space); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_value); else { @@ -311,15 +326,12 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, auto wgroup_size = m_team_size * final_vector_size; std::size_t size = std::size_t(m_league_size) * wgroup_size; sycl::local_accessor<value_type, 1> local_mem( - sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + - (sizeof(unsigned int) + sizeof(value_type) - 1) / - sizeof(value_type), - cgh); + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); const auto init_size = std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1); results_ptr = - static_cast<sycl::device_ptr<value_type>>(instance.scratch_space( + static_cast<sycl_device_ptr<value_type>>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u) * init_size)); size_t max_work_groups = @@ -347,22 +359,36 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, sycl::range<2>(m_team_size, n_wgroups * m_vector_size), sycl::range<2>(m_team_size, m_vector_size)), reduction_lambda); - }); + }; +#ifdef SYCL_EXT_ONEAPI_GRAPH + if constexpr (Policy::is_graph_kernel::value) { + sycl_attach_kernel_to_node(*this, cgh_lambda); + } else +#endif + { + last_reduction_event = q.submit(cgh_lambda); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - q.ext_oneapi_submit_barrier( - std::vector<sycl::event>{parallel_reduce_event}); + q.ext_oneapi_submit_barrier( + 
std::vector<sycl::event>{last_reduction_event}); #endif - last_reduction_event = parallel_reduce_event; + } } // At this point, the reduced value is written to the entry in results_ptr // and all that is left is to copy it back to the given result pointer if // necessary. - if (m_result_ptr && !m_result_ptr_device_accessible) { - Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + if constexpr (Policy::is_graph_kernel::value) + Kokkos::abort( + "parallel_reduce not implemented for graph kernels if result is " + "not device-accessible!"); + + space.fence( + "Kokkos::Impl::ParallelReduce<SYCL, TeamPolicy>::execute: result not " + "device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -370,30 +396,59 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, public: inline void execute() { - Kokkos::Experimental::Impl::SYCLInternal& instance = + Kokkos::Impl::SYCLInternal& instance = *m_policy.space().impl_internal_space_instance(); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + + // Only let one instance at a time resize the instance's scratch memory + // allocations. + std::scoped_lock<std::mutex> scratch_buffers_lock( + instance.m_mutexScratchSpace); + std::scoped_lock<std::mutex> team_scratch_lock( + instance.m_team_scratch_mutex); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ int scratch_pool_id = instance.acquire_team_scratch_space(); + const sycl_device_ptr<char> global_scratch_ptr = + static_cast<sycl_device_ptr<char>>(instance.resize_team_scratch_space( + scratch_pool_id, + static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size)); + + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = - sycl_direct_launch(m_policy, functor_reducer_wrapper, + sycl_direct_launch(global_scratch_ptr, functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event()); functor_reducer_wrapper.register_event(event); - instance.register_team_scratch_event(m_scratch_pool_id, event); + instance.register_team_scratch_event(scratch_pool_id, event); } - private: - void initialize() { - // FIXME_SYCL optimize - if (m_team_size < 0) + template <class ViewType> + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + typename ViewType::memory_space>::accessible), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended( m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), ParallelReduceTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce<SYCL, TeamPolicy> could not find a " + "valid execution configuration."); + } + // Must be a power of two greater than two, get the one not bigger than the // 
requested one. if ((m_team_size & m_team_size - 1) || m_team_size < 2) { @@ -409,22 +464,15 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - auto& space = *m_policy.space().impl_internal_space_instance(); - m_scratch_pool_id = space.acquire_team_scratch_space(); - m_global_scratch_ptr = - static_cast<sycl::device_ptr<char>>(space.resize_team_scratch_space( - m_scratch_pool_id, - static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size)); - - if (static_cast<int>(space.m_maxShmemPerBlock) < + const Kokkos::Impl::SYCLInternal& instance = + *m_policy.space().impl_internal_space_instance(); + if (static_cast<int>(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { std::stringstream out; out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! " "Requested " << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << space.m_maxShmemPerBlock << '\n'; + << instance.m_maxShmemPerBlock << '\n'; Kokkos::Impl::throw_runtime_exception(out.str()); } @@ -434,25 +482,6 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType, Kokkos::Impl::throw_runtime_exception( "Kokkos::Impl::ParallelFor<SYCL> requested too large team size."); } - - public: - template <class ViewType> - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, - typename ViewType::memory_space>::accessible), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()), - m_scratch_lock(arg_policy.space() - 
.impl_internal_space_instance() - ->m_team_scratch_mutex) { - initialize(); - } }; #endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 04425723e1987586277ca6322e2066653e97c250..ed7cee2805d94dbea114226385ee969f7ec4cd9a 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -14,10 +14,11 @@ // //@HEADER -#ifndef KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP -#define KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP +#ifndef KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP #include <Kokkos_Macros.hpp> +#include <SYCL/Kokkos_SYCL_WorkgroupReduction.hpp> #include <memory> #include <vector> @@ -35,20 +36,38 @@ void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& final_reducer, auto sg = item.get_sub_group(); const int sg_group_id = sg.get_group_id()[0]; const int id_in_sg = sg.get_local_id()[0]; - - for (int stride = 1; stride < global_range; stride <<= 1) { - auto tmp = sg.shuffle_up(local_value, stride); + const int local_range = std::min<int>(sg.get_local_range()[0], global_range); + +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int stride) { + if (stride < local_range) { + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right(sg, local_value, + stride); + if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(local_range <= 32); +#else + for (int stride = 1; stride < local_range; stride <<= 1) { + auto tmp = + Kokkos::Impl::SYCLReduction::shift_group_right(sg, local_value, stride); if (id_in_sg >= stride) final_reducer.join(&local_value, &tmp); } +#endif const int max_subgroup_size = sg.get_max_local_range()[0]; const int n_active_subgroups = 
(global_range + max_subgroup_size - 1) / max_subgroup_size; - const int local_range = sg.get_local_range()[0]; if (id_in_sg == local_range - 1 && sg_group_id < n_active_subgroups) local_mem[sg_group_id] = local_value; - local_value = sg.shuffle_up(local_value, 1); + local_value = + Kokkos::Impl::SYCLReduction::shift_group_right(sg, local_value, 1); if (id_in_sg == 0) final_reducer.init(&local_value); sycl::group_barrier(item.get_group()); @@ -61,8 +80,29 @@ void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& final_reducer, const auto upper_bound = std::min(local_range, n_active_subgroups - round * local_range); auto local_sg_value = local_mem[idx < n_active_subgroups ? idx : 0]; +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine_sg = [&](int stride) { + if (stride < upper_bound) { + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right( + sg, local_sg_value, stride); + if (id_in_sg >= stride) { + if (idx < n_active_subgroups) + final_reducer.join(&local_sg_value, &tmp); + else + local_sg_value = tmp; + } + } + }; + shuffle_combine_sg(1); + shuffle_combine_sg(2); + shuffle_combine_sg(4); + shuffle_combine_sg(8); + shuffle_combine_sg(16); + KOKKOS_ASSERT(upper_bound <= 32); +#else for (int stride = 1; stride < upper_bound; stride <<= 1) { - auto tmp = sg.shuffle_up(local_sg_value, stride); + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right( + sg, local_sg_value, stride); if (id_in_sg >= stride) { if (idx < n_active_subgroups) final_reducer.join(&local_sg_value, &tmp); @@ -70,6 +110,7 @@ void workgroup_scan(sycl::nd_item<dim> item, const FunctorType& final_reducer, local_sg_value = tmp; } } +#endif if (idx < n_active_subgroups) { local_mem[idx] = local_sg_value; if (round > 0) @@ -104,121 +145,118 @@ class ParallelScanSYCLBase { using value_type = typename Analysis::value_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; - using size_type = 
Kokkos::Experimental::SYCL::size_type; + using size_type = Kokkos::SYCL::size_type; using index_type = typename Policy::index_type; protected: const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer> m_functor_reducer; const Policy m_policy; - pointer_type m_scratch_space = nullptr; - const pointer_type m_result_ptr; + sycl_host_ptr<value_type> m_scratch_host = nullptr; + pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one Parallel/Scan modify the shared memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock<std::mutex> m_shared_memory_lock; - private: template <typename FunctorWrapper> sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, sycl::event memcpy_event) { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - Kokkos::Experimental::Impl::SYCLInternal& instance = + const Kokkos::SYCL& space = m_policy.space(); + Kokkos::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>( + auto scratch_flags = static_cast<sycl_device_ptr<unsigned int>>( instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); // Initialize global memory - auto scan_lambda_factory = - [&](sycl::local_accessor<value_type> local_mem, - sycl::local_accessor<unsigned int> num_teams_done, - sycl::device_ptr<value_type> global_mem_, - sycl::device_ptr<value_type> group_results_) { - auto lambda = [=](sycl::nd_item<1> item) { - auto global_mem = global_mem_; - auto group_results = group_results_; - - const CombinedFunctorReducer< - FunctorType, typename Analysis::Reducer>& functor_reducer = - functor_wrapper.get_functor(); - const FunctorType& functor = functor_reducer.get_functor(); - const typename Analysis::Reducer& reducer = - 
functor_reducer.get_reducer(); - - const auto n_wgroups = item.get_group_range()[0]; - const int wgroup_size = item.get_local_range()[0]; - - const int local_id = item.get_local_linear_id(); - const index_type global_id = item.get_global_linear_id(); - - // Initialize local memory - value_type local_value; - reducer.init(&local_value); - if (global_id < size) { - if constexpr (std::is_void<WorkTag>::value) - functor(global_id + begin, local_value, false); - else - functor(WorkTag(), global_id + begin, local_value, false); - } + auto scan_lambda_factory = [&](sycl::local_accessor<value_type> local_mem, + sycl::local_accessor<unsigned int> + num_teams_done, + sycl_device_ptr<value_type> global_mem_, + sycl_device_ptr<value_type> group_results_) { + auto lambda = [=](sycl::nd_item<1> item) { + auto global_mem = global_mem_; + auto group_results = group_results_; + + const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>& + functor_reducer = functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); + + const auto n_wgroups = item.get_group_range()[0]; + const int wgroup_size = item.get_local_range()[0]; + + const int local_id = item.get_local_linear_id(); + const index_type global_id = item.get_global_linear_id(); + + // Initialize local memory + value_type local_value; + reducer.init(&local_value); + if (global_id < size) { + if constexpr (std::is_void<WorkTag>::value) + functor(global_id + begin, local_value, false); + else + functor(WorkTag(), global_id + begin, local_value, false); + } - workgroup_scan<>(item, reducer, local_mem, local_value, - wgroup_size); + workgroup_scan<>(item, reducer, local_mem, local_value, wgroup_size); - // Write results to global memory - if (global_id < size) global_mem[global_id] = local_value; + // Write results to global memory + if (global_id < size) global_mem[global_id] = local_value; - if (local_id == 
wgroup_size - 1) { - group_results[item.get_group_linear_id()] = - local_mem[item.get_sub_group().get_group_range()[0] - 1]; + if (local_id == wgroup_size - 1) { + group_results[item.get_group_linear_id()] = + local_mem[item.get_sub_group().get_group_range()[0] - 1]; - sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel, - sycl::memory_scope::device, - sycl::access::address_space::global_space> - scratch_flags_ref(*scratch_flags); - num_teams_done[0] = ++scratch_flags_ref; - } - item.barrier(sycl::access::fence_space::global_space); - if (num_teams_done[0] == n_wgroups) { - value_type total; - reducer.init(&total); - - for (unsigned int offset = 0; offset < n_wgroups; - offset += wgroup_size) { - index_type id = local_id + offset; - if (id < static_cast<index_type>(n_wgroups)) - local_value = group_results[id]; - else - reducer.init(&local_value); - workgroup_scan<>( - item, reducer, local_mem, local_value, - std::min<index_type>(n_wgroups - offset, wgroup_size)); - if (id < static_cast<index_type>(n_wgroups)) { - reducer.join(&local_value, &total); - group_results[id] = local_value; - } - reducer.join( - &total, - &local_mem[item.get_sub_group().get_group_range()[0] - 1]); - if (offset + wgroup_size < n_wgroups) - item.barrier(sycl::access::fence_space::global_space); - } + sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel, + sycl::memory_scope::device, + sycl::access::address_space::global_space> + scratch_flags_ref(*scratch_flags); + num_teams_done[0] = ++scratch_flags_ref; + } + item.barrier(sycl::access::fence_space::global_space); + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; + value_type total; + reducer.init(&total); + + for (unsigned int offset = 0; offset < n_wgroups; + offset += wgroup_size) { + index_type id = local_id + offset; + if (id < static_cast<index_type>(n_wgroups)) + local_value = group_results[id]; + else + reducer.init(&local_value); + workgroup_scan<>( + item, reducer, local_mem, local_value, + 
std::min<index_type>(n_wgroups - offset, wgroup_size)); + if (id < static_cast<index_type>(n_wgroups)) { + reducer.join(&local_value, &total); + group_results[id] = local_value; } - }; - return lambda; - }; + reducer.join( + &total, + &local_mem[item.get_sub_group().get_group_range()[0] - 1]); + if (offset + wgroup_size < n_wgroups) + item.barrier(sycl::access::fence_space::global_space); + } + } + }; + return lambda; + }; size_t wgroup_size; size_t n_wgroups; - sycl::device_ptr<value_type> global_mem; - sycl::device_ptr<value_type> group_results; + sycl_device_ptr<value_type> global_mem; + sycl_device_ptr<value_type> group_results; + + desul::ensure_sycl_lock_arrays_on_device(q); auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) { sycl::local_accessor<unsigned int> num_teams_done(1, cgh); @@ -251,9 +289,10 @@ class ParallelScanSYCLBase { // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass global_mem = - static_cast<sycl::device_ptr<value_type>>(instance.scratch_space( + static_cast<sycl_device_ptr<value_type>>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_space = global_mem; + m_scratch_host = static_cast<sycl_host_ptr<value_type>>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; @@ -281,10 +320,11 @@ class ParallelScanSYCLBase { // Write results to global memory auto update_global_results = q.submit([&](sycl::handler& cgh) { - auto result_ptr_device_accessible = m_result_ptr_device_accessible; // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr // directly. - auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; + pointer_type result_ptr = m_result_ptr_device_accessible + ? 
m_result_ptr + : static_cast<pointer_type>(m_scratch_host); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(perform_work_group_scans); @@ -293,7 +333,6 @@ class ParallelScanSYCLBase { cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { - auto global_mem_copy = global_mem; const index_type global_id = item.get_global_linear_id(); const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = @@ -312,9 +351,7 @@ class ParallelScanSYCLBase { else functor(WorkTag(), global_id + begin, update, true); - global_mem_copy[global_id] = update; - if (global_id == size - 1 && result_ptr_device_accessible) - *result_ptr = update; + if (global_id == size - 1) *result_ptr = update; } }); }); @@ -332,11 +369,16 @@ class ParallelScanSYCLBase { auto& instance = *m_policy.space().impl_internal_space_instance(); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + // Only let one instance at a time resize the instance's scratch memory + // allocations. 
+ std::scoped_lock<std::mutex> scratch_buffers_lock( + instance.m_mutexScratchSpace); + + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor_reducer, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); @@ -350,17 +392,14 @@ class ParallelScanSYCLBase { : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), m_policy(arg_policy), m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_shared_memory_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} }; } // namespace Kokkos::Impl template <class FunctorType, class... Traits> class Kokkos::Impl::ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, - Kokkos::Experimental::SYCL> + Kokkos::SYCL> : private ParallelScanSYCLBase<FunctorType, void, Traits...> { public: using Base = ParallelScanSYCLBase<FunctorType, void, Traits...>; @@ -378,23 +417,24 @@ class Kokkos::Impl::ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, template <class FunctorType, class ReturnType, class... 
Traits> class Kokkos::Impl::ParallelScanWithTotal< - FunctorType, Kokkos::RangePolicy<Traits...>, ReturnType, - Kokkos::Experimental::SYCL> + FunctorType, Kokkos::RangePolicy<Traits...>, ReturnType, Kokkos::SYCL> : public ParallelScanSYCLBase<FunctorType, ReturnType, Traits...> { public: using Base = ParallelScanSYCLBase<FunctorType, ReturnType, Traits...>; - const Kokkos::Experimental::SYCL& m_exec; + const Kokkos::SYCL& m_exec; inline void execute() { Base::impl_execute([&]() { const long long nwork = Base::m_policy.end() - Base::m_policy.begin(); if (nwork > 0 && !Base::m_result_ptr_device_accessible) { + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x + // slower. + m_exec.fence( + "Kokkos::Impl::ParallelReduce<SYCL, MDRangePolicy>::execute: " + "result not device-accessible"); const int size = Base::m_functor_reducer.get_reducer().value_size(); - DeepCopy<HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCL>(m_exec, Base::m_result_ptr, - Base::m_scratch_space + nwork - 1, - size); + std::memcpy(Base::m_result_ptr, Base::m_scratch_host, size); } }); } @@ -404,7 +444,7 @@ class Kokkos::Impl::ParallelScanWithTotal< const typename Base::Policy& arg_policy, const ViewType& arg_result_view) : Base(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess<Experimental::SYCLDeviceUSMSpace, + MemorySpaceAccess<SYCLDeviceUSMSpace, typename ViewType::memory_space>::accessible), m_exec(arg_policy.space()) {} }; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 64b7f56796a527a73546f248030f43bf73541fc6..022f88e0a812e7c0a05fd56020316ebbe1e37065 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -25,7 +25,6 @@ #include <SYCL/Kokkos_SYCL_Space.hpp> #include <SYCL/Kokkos_SYCL_DeepCopy.hpp> #include <SYCL/Kokkos_SYCL_Instance.hpp> -#include <impl/Kokkos_MemorySpace.hpp> 
#include <impl/Kokkos_Profiling.hpp> /*--------------------------------------------------------------------------*/ @@ -34,11 +33,11 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); } -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n) { sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; auto event = q.memcpy(dst, src, n); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -47,9 +46,8 @@ void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, } void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); - Experimental::SYCL().fence( - "Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + SYCL().fence("Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); } } // namespace Impl @@ -57,9 +55,22 @@ void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) { /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ +namespace { + +std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { + switch (allocation_kind) { + case sycl::usm::alloc::host: return Kokkos::SYCLHostUSMSpace::name(); + case sycl::usm::alloc::device: return Kokkos::SYCLDeviceUSMSpace::name(); + case sycl::usm::alloc::shared: return Kokkos::SYCLSharedUSMSpace::name(); + default: + Kokkos::abort("bug: unknown sycl allocation type"); + return "unreachable"; + } +} + +} // namespace namespace Kokkos { -namespace Experimental { SYCLDeviceUSMSpace::SYCLDeviceUSMSpace() : 
m_queue(*SYCL().impl_internal_space_instance()->m_queue) {} @@ -76,17 +87,17 @@ SYCLHostUSMSpace::SYCLHostUSMSpace() SYCLHostUSMSpace::SYCLHostUSMSpace(sycl::queue queue) : m_queue(std::move(queue)) {} -void* allocate_sycl( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle, - const RawMemoryAllocationFailure::AllocationMechanism failure_tag, - const sycl::usm::alloc allocation_kind, const sycl::queue& queue) { +void* allocate_sycl(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle, + const sycl::usm::alloc allocation_kind, + const sycl::queue& queue) { void* const hostPtr = sycl::malloc(arg_alloc_size, queue, allocation_kind); - if (hostPtr == nullptr) - throw RawMemoryAllocationFailure( - arg_alloc_size, 1, RawMemoryAllocationFailure::FailureMode::Unknown, - failure_tag); + if (hostPtr == nullptr) { + Kokkos::Impl::throw_bad_alloc(get_memory_space_name(allocation_kind), + arg_alloc_size, arg_label); + } if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = @@ -98,21 +109,19 @@ void* allocate_sycl( return hostPtr; } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const size_t arg_alloc_size) const { return allocate(exec_space, "[unlabeled]", arg_alloc_size); } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocDevice, - sycl::usm::alloc::device, - *exec_space.impl_internal_space_instance()->m_queue); + return 
allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::device, + *exec_space.impl_internal_space_instance()->m_queue); } void* SYCLDeviceUSMSpace::allocate(const size_t arg_alloc_size) const { @@ -122,11 +131,9 @@ void* SYCLDeviceUSMSpace::allocate(const size_t arg_alloc_size) const { void* SYCLDeviceUSMSpace::allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocDevice, - sycl::usm::alloc::device, m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::device, m_queue); } void* SYCLSharedUSMSpace::allocate(const SYCL& exec_space, @@ -137,12 +144,10 @@ void* SYCLSharedUSMSpace::allocate(const SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocShared, - sycl::usm::alloc::shared, - *exec_space.impl_internal_space_instance()->m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::shared, + *exec_space.impl_internal_space_instance()->m_queue); } void* SYCLSharedUSMSpace::allocate(const size_t arg_alloc_size) const { @@ -151,11 +156,9 @@ void* SYCLSharedUSMSpace::allocate(const size_t arg_alloc_size) const { void* SYCLSharedUSMSpace::allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - 
RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocShared, - sycl::usm::alloc::shared, m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::shared, m_queue); } void* SYCLHostUSMSpace::allocate(const SYCL& exec_space, @@ -165,12 +168,10 @@ void* SYCLHostUSMSpace::allocate(const SYCL& exec_space, void* SYCLHostUSMSpace::allocate(const SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocHost, - sycl::usm::alloc::host, - *exec_space.impl_internal_space_instance()->m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::host, + *exec_space.impl_internal_space_instance()->m_queue); } void* SYCLHostUSMSpace::allocate(const size_t arg_alloc_size) const { @@ -179,11 +180,9 @@ void* SYCLHostUSMSpace::allocate(const size_t arg_alloc_size) const { void* SYCLHostUSMSpace::allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { - return allocate_sycl( - arg_label, arg_alloc_size, arg_logical_size, - Kokkos::Tools::make_space_handle(name()), - RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocHost, - sycl::usm::alloc::host, m_queue); + return allocate_sycl(arg_label, arg_alloc_size, arg_logical_size, + Kokkos::Tools::make_space_handle(name()), + sycl::usm::alloc::host, m_queue); } void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr, @@ -240,205 +239,6 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, Kokkos::Tools::make_space_handle(name()), m_queue); } -} // namespace Experimental -} // namespace Kokkos - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG 
-SharedAllocationRecord<void, void> SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>::s_root_record; - -SharedAllocationRecord<void, void> SharedAllocationRecord< - Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record; - -SharedAllocationRecord<void, void> SharedAllocationRecord< - Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record; -#endif - -SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord<void, void>::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, - void>::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(space, label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Experimental::SYCL exec; - Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, HostSpace>( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, " - "void>::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& arg_exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord<void, void>::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( 
-#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, - void>::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_exec_space, space, - label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, HostSpace>( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, - void>::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, - 
void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, - void>::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, - void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -} // namespace 
Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, - void>::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, - void>::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, - void>::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl } // namespace Kokkos //============================================================================== @@ -446,23 +246,12 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, #include <impl/Kokkos_SharedAlloc_timpl.hpp> -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::SYCLHostUSMSpace); // </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp index 239c6e3ce0b40311cb1d486051f61c9641b771ac..5a37da130caffa701f613d5a51e263b0546e1691 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -39,8 +39,6 @@ template <typename T> struct is_sycl_type_space : public std::false_type {}; } // namespace Impl -namespace Experimental { - class SYCLDeviceUSMSpace { public: using execution_space = SYCL; @@ -66,11 +64,6 @@ class SYCLDeviceUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template <class, class, class, class> - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLDeviceUSM"; }; private: @@ -87,6 +80,16 @@ class SYCLSharedUSMSpace { SYCLSharedUSMSpace(); explicit SYCLSharedUSMSpace(sycl::queue queue); + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename 
ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -102,11 +105,6 @@ class SYCLSharedUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template <class, class, class, class> - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLSharedUSM"; }; private: @@ -123,6 +121,16 @@ class SYCLHostUSMSpace { SYCLHostUSMSpace(); explicit SYCLHostUSMSpace(sycl::queue queue); + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template <typename ExecutionSpace> + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -138,59 +146,46 @@ class SYCLHostUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template <class, class, class, class> - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLHostUSM"; }; private: sycl::queue m_queue; }; -} // namespace Experimental - namespace Impl { template <> -struct is_sycl_type_space<Kokkos::Experimental::SYCLDeviceUSMSpace> - : public std::true_type {}; +struct is_sycl_type_space<Kokkos::SYCLDeviceUSMSpace> : public std::true_type { +}; template <> -struct is_sycl_type_space<Kokkos::Experimental::SYCLSharedUSMSpace> - : public std::true_type {}; +struct 
is_sycl_type_space<Kokkos::SYCLSharedUSMSpace> : public std::true_type { +}; template <> -struct is_sycl_type_space<Kokkos::Experimental::SYCLHostUSMSpace> - : public std::true_type {}; +struct is_sycl_type_space<Kokkos::SYCLHostUSMSpace> : public std::true_type {}; -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); +static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::SYCLDeviceUSMSpace>::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); +static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLSharedUSMSpace>::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); +static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::SYCLDeviceUSMSpace>::assignable); template <> -struct MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace> { +struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::SYCLDeviceUSMSpace> { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace> { +struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::SYCLSharedUSMSpace> { // HostSpace::execution_space != SYCLSharedUSMSpace::execution_space enum : bool { assignable = false }; enum : bool { accessible = true }; @@ -198,26 +193,24 @@ struct MemorySpaceAccess<Kokkos::HostSpace, }; template <> -struct MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace> { +struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::SYCLHostUSMSpace> { // HostSpace::execution_space == - // 
Experimental::SYCLHostUSMSpace::execution_space + // SYCLHostUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::HostSpace> { +struct MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, Kokkos::HostSpace> { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace> { +struct MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::SYCLSharedUSMSpace> { // SYCLDeviceUSMSpace::execution_space == SYCLSharedUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; @@ -225,14 +218,11 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, }; template <> -struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace> { - // Experimental::SYCLDeviceUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, Kokkos::SYCLHostUSMSpace> { + // SYCLDeviceUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLDeviceUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLDeviceUSMSpace::execution_space enum : bool { deepcopy = true }; }; @@ -241,16 +231,15 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, // SYCLSharedUSMSpace accessible to both SYCL and Host template <> -struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::HostSpace> { +struct MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, Kokkos::HostSpace> { enum : bool { assignable = false }; enum : bool { accessible = false }; // SYCL cannot access HostSpace 
enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace> { +struct MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLDeviceUSMSpace> { // SYCLSharedUSMSpace::execution_space == SYCLDeviceUSMSpace::execution_space // Can access SYCLSharedUSMSpace from Host but cannot access // SYCLDeviceUSMSpace from Host @@ -262,47 +251,38 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace, }; template <> -struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace> { - // Experimental::SYCLSharedUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, Kokkos::SYCLHostUSMSpace> { + // SYCLSharedUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLSharedUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLSharedUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::HostSpace> { +struct MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, Kokkos::HostSpace> { enum : bool { assignable = false }; // Cannot access from SYCL - enum : bool { - accessible = true - }; // Experimental::SYCLHostUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLHostUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace> { +struct MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, Kokkos::SYCLDeviceUSMSpace> { enum : bool { assignable = false }; // Cannot access from Host enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct 
MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace> { +struct MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, Kokkos::SYCLSharedUSMSpace> { enum : bool { assignable = false }; // different execution_space enum : bool { accessible = true }; // same accessibility enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> { +struct MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::ScratchMemorySpace<Kokkos::SYCL>> { enum : bool { assignable = false }; enum : bool { accessible = true }; enum : bool { deepcopy = false }; @@ -310,151 +290,12 @@ struct MemorySpaceAccess< } // namespace Impl -namespace Impl { - -template <> -class SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void> - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::SYCLDeviceUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, 
const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void> - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; - using base_t = - SharedAllocationRecordCommon<Kokkos::Experimental::SYCLSharedUSMSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLSharedUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const 
Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void> - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - using base_t = - SharedAllocationRecordCommon<Kokkos::Experimental::SYCLHostUSMSpace>; - using RecordBase = SharedAllocationRecord<void, void>; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLHostUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template <typename ExecutionSpace> - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - 
SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl - } // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLHostUSMSpace); + #endif #endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index 89c09c3195feeb8c7c6d6ba7cce353ab7a05c5b6..6359e4a2d9e763907aa55fdb40c5e280b9ad9867 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -22,6 +22,7 @@ #ifdef KOKKOS_ENABLE_SYCL #include <utility> +#include <SYCL/Kokkos_SYCL_WorkgroupReduction.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -33,7 +34,7 @@ namespace Impl { */ class SYCLTeamMember { public: - using execution_space = Kokkos::Experimental::SYCL; + using execution_space = Kokkos::SYCL; using scratch_memory_space = execution_space::scratch_memory_space; using team_handle = SYCLTeamMember; @@ -125,6 +126,20 @@ class SYCLTeamMember { team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<SYCL>, ReducerType, + value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template <typename WrappedReducerType> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<WrappedReducerType>::value> + impl_team_reduce( + 
WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + using value_type = typename WrappedReducerType::value_type; auto sg = m_item.get_sub_group(); const auto sub_group_range = sg.get_local_range()[0]; @@ -133,67 +148,69 @@ class SYCLTeamMember { const unsigned int team_rank_ = team_rank(); // First combine the values in the same subgroup +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int shift) { + if (vector_range * shift < sub_group_range) { + const value_type tmp = Kokkos::Impl::SYCLReduction::shift_group_left( + sg, value, vector_range * shift); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(sub_group_range <= 32); +#else for (unsigned int shift = 1; vector_range * shift < sub_group_range; shift <<= 1) { - const value_type tmp = sg.shuffle_down(value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left( + sg, value, vector_range * shift); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } - value = sg.shuffle(value, 0); +#endif + value = Kokkos::Impl::SYCLReduction::select_from_group(sg, value, 0); - // We need to chunk up the whole reduction because we might not have - // allocated enough memory. - const auto n_subgroups = sg.get_group_range()[0]; - const unsigned int maximum_work_range = - std::min<int>(m_team_reduce_size / sizeof(value_type), n_subgroups); + const int n_subgroups = sg.get_group_range()[0]; + if (n_subgroups == 1) { + return; + } + + // It was found experimentally that 16 is a good value for Intel PVC. 
+ // Since there is a maximum number of 1024 threads with subgroup size 16, + // we have a maximum of 64 subgroups per workgroup which means 64/16=4 + // rounds for loading values into the reduction_array, and 16 redundant + // reduction steps executed by every thread. + constexpr int step_width = 16; + auto tmp_alloc = sycl::ext::oneapi::group_local_memory_for_overwrite< + value_type[step_width]>(m_item.get_group()); + auto& reduction_array = *tmp_alloc; const auto id_in_sg = sg.get_local_id()[0]; - auto reduction_array = - static_cast<sycl::local_ptr<value_type>>(m_team_reduce); - // Load values into the first maximum_work_range values of the reduction + // Load values into the first step_width values of the reduction // array in chunks. This means that only sub groups with an id in the // corresponding chunk load values. - const auto group_id = sg.get_group_id()[0]; - if (id_in_sg == 0 && group_id < maximum_work_range) + const int group_id = sg.get_group_id()[0]; + if (id_in_sg == 0 && group_id < step_width) reduction_array[group_id] = value; sycl::group_barrier(m_item.get_group()); - for (unsigned int start = maximum_work_range; start < n_subgroups; - start += maximum_work_range) { + for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && - group_id < - std::min<unsigned int>(start + maximum_work_range, n_subgroups)) - reducer.join(reduction_array[group_id - start], value); + group_id < std::min(start + step_width, n_subgroups)) + wrapped_reducer.join(&reduction_array[group_id - start], &value); sycl::group_barrier(m_item.get_group()); } - // Let the first subgroup do the final reduction - if (group_id == 0) { - const auto local_range = sg.get_local_range()[0]; - auto result = - reduction_array[id_in_sg < maximum_work_range ? id_in_sg : 0]; - // In case the maximum_work_range is larger than the range of the first - // subgroup, we first combine the items with a higher index. 
- for (unsigned int offset = local_range; offset < maximum_work_range; - offset += local_range) - if (id_in_sg + offset < maximum_work_range) - reducer.join(result, reduction_array[id_in_sg + offset]); - sycl::group_barrier(sg); - - // Now do the actual subgroup reduction. - const auto min_range = - std::min<unsigned int>(maximum_work_range, local_range); - for (unsigned int stride = 1; stride < min_range; stride <<= 1) { - const auto tmp = sg.shuffle_down(result, stride); - if (id_in_sg + stride < min_range) reducer.join(result, tmp); - } - if (id_in_sg == 0) reduction_array[0] = result; - } - sycl::group_barrier(m_item.get_group()); + // Do the final reduction for all threads redundantly + value = reduction_array[0]; + for (int i = 1; i < std::min(step_width, n_subgroups); ++i) + wrapped_reducer.join(&value, &reduction_array[i]); - reducer.reference() = reduction_array[0]; - // Make sure that the reduction array hasn't been modified in the meantime. - m_item.barrier(sycl::access::fence_space::local_space); + // Make sure that every thread is done using the reduction array. 
+ sycl::group_barrier(m_item.get_group()); } //-------------------------------------------------------------------------- @@ -218,7 +235,8 @@ class SYCLTeamMember { // First combine the values in the same subgroup for (unsigned int stride = 1; vector_range * stride < sub_group_range; stride <<= 1) { - auto tmp = sg.shuffle_up(value, vector_range * stride); + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right( + sg, value, vector_range * stride); if (id_in_sg >= vector_range * stride) value += tmp; } @@ -244,7 +262,8 @@ class SYCLTeamMember { sub_group_range, n_active_subgroups - round * sub_group_range); auto local_value = base_data[idx]; for (unsigned int stride = 1; stride < upper_bound; stride <<= 1) { - auto tmp = sg.shuffle_up(local_value, stride); + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_right( + sg, local_value, stride); if (id_in_sg >= stride) { if (idx < n_active_subgroups) local_value += tmp; @@ -262,9 +281,10 @@ class SYCLTeamMember { } auto total = base_data[n_active_subgroups - 1]; - const auto update = sg.shuffle_up(value, vector_range); - Type intermediate = (group_id > 0 ? base_data[group_id - 1] : 0) + - (id_in_sg >= vector_range ? update : 0); + const auto update = + Kokkos::Impl::SYCLReduction::shift_group_right(sg, value, vector_range); + Type intermediate = (group_id > 0 ? base_data[group_id - 1] : Type{0}) + + (id_in_sg >= vector_range ? 
update : Type{0}); if (global_accum) { if (id_in_sg == sub_group_range - 1 && @@ -303,6 +323,19 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const { + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<SYCL>, ReducerType, + value_type>::Reducer; + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template <typename WrappedReducerType> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<WrappedReducerType>::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const { const auto tidx1 = m_item.get_local_id(1); const auto grange1 = m_item.get_local_range(1); @@ -311,13 +344,13 @@ class SYCLTeamMember { if (grange1 == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = grange1; (i >>= 1);) { - tmp2 = sg.shuffle_down(tmp, i); + tmp2 = Kokkos::Impl::SYCLReduction::shift_group_left(sg, tmp, i); if (static_cast<int>(tidx1) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -326,9 +359,9 @@ class SYCLTeamMember { // because floating point summation is not associative // and thus different threads could have different results. 
- tmp2 = sg.shuffle(tmp, (sg.get_local_id() / grange1) * grange1); + tmp2 = Kokkos::Impl::SYCLReduction::select_from_group( + sg, tmp, (sg.get_local_id() / grange1) * grange1); value = tmp2; - reducer.reference() = tmp2; } //---------------------------------------- @@ -337,7 +370,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr<void> shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl::device_ptr<void> scratch_level_1_ptr, + sycl_device_ptr<void> scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) @@ -522,8 +555,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::SYCLTeamMember::execution_space>, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); @@ -532,7 +573,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel_reduce assuming summation. 
@@ -548,20 +591,28 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum<ValueType> reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::SYCLTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); i < loop_boundaries.end; i += loop_boundaries.member.item().get_local_range(0)) { - closure(i, val); + closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + result = value; } /** \brief Inter-thread parallel exclusive prefix sum. 
@@ -648,8 +699,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::SYCLTeamMember::execution_space>, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -661,8 +720,11 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< i < loop_boundaries.end; i += grange0 * grange1) closure(i, value); - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; } template <typename iType, class Closure, typename ValueType> @@ -670,10 +732,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum<ValueType> reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::SYCLTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using 
value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -683,11 +751,13 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1; i < loop_boundaries.end; i += grange0 * grange1) - closure(i, val); + closure(i, value); + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- @@ -737,16 +807,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::SYCLTeamMember::execution_space>, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const iType grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, reducer.reference()); + closure(i, value); - 
loop_boundaries.member.vector_reduce(reducer); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Intra-thread vector parallel_reduce. @@ -765,16 +846,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!is_reducer<ValueType>::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::SYCLTeamMember::execution_space>, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const int grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, result); + closure(i, value); - loop_boundaries.member.vector_reduce(Kokkos::Sum<ValueType>(result)); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- @@ -834,7 +926,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< // [t] += [t-4] if t >= 4 // ... 
for (int j = 1; j < static_cast<int>(grange1); j <<= 1) { - value_type tmp = sg.shuffle_up(val, j); + value_type tmp = + Kokkos::Impl::SYCLReduction::shift_group_right(sg, val, j); if (j <= static_cast<int>(tidx1)) { reducer.join(val, tmp); } @@ -845,7 +938,8 @@ parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< // Update i's contribution into the val and add it to accum for next round if (i < loop_boundaries.end) closure(i, val, true); - accum = sg.shuffle(val, mask + vector_offset); + accum = Kokkos::Impl::SYCLReduction::select_from_group( + sg, val, mask + vector_offset); } reducer.reference() = accum; } @@ -922,7 +1016,8 @@ KOKKOS_INLINE_FUNCTION void single( const auto grange1 = item.get_local_range(1); const auto sg = item.get_sub_group(); if (item.get_local_id(1) == 0) lambda(val); - val = sg.shuffle(val, (sg.get_local_id() / grange1) * grange1); + val = Kokkos::Impl::SYCLReduction::select_from_group( + sg, val, (sg.get_local_id() / grange1) * grange1); } template <class FunctorType, class ValueType> diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp index 17ce59058bdda81124f67d7ca580604d3fd35ba5..556ca0d28186bf1f6668f868b976c858a672c2b7 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp @@ -22,8 +22,7 @@ #include <vector> template <typename... Properties> -class Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::SYCL, - Properties...> +class Kokkos::Impl::TeamPolicyInternal<Kokkos::SYCL, Properties...> : public PolicyTraits<Properties...> { public: using execution_policy = TeamPolicyInternal; @@ -45,7 +44,7 @@ class Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::SYCL, bool m_tune_vector_length; public: - using execution_space = Kokkos::Experimental::SYCL; + using execution_space = Kokkos::SYCL; template <class... 
OtherProperties> TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) { diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index d55fc6a84ba4b05e292770919037af471c99001b..79d9e8a8d482c21d74eafdd4edc2c79259af4282 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -22,13 +22,14 @@ #include <Kokkos_UniqueToken.hpp> namespace Kokkos { -namespace Experimental { namespace Impl { Kokkos::View<uint32_t*, SYCLDeviceUSMSpace> sycl_global_unique_token_locks( bool deallocate = false); } +namespace Experimental { + // both global and instance Unique Tokens are implemented in the same way // the global version has one shared static lock array underneath // but it can't be a static member variable since we need to acces it on device @@ -42,7 +43,7 @@ class UniqueToken<SYCL, UniqueTokenScope::Global> { using size_type = int32_t; explicit UniqueToken(execution_space const& = execution_space()) - : m_locks(Impl::sycl_global_unique_token_locks()) {} + : m_locks(Kokkos::Impl::sycl_global_unique_token_locks()) {} KOKKOS_DEFAULTED_FUNCTION UniqueToken(const UniqueToken&) = default; @@ -75,11 +76,15 @@ class UniqueToken<SYCL, UniqueTokenScope::Global> { /// \brief acquire value such that 0 <= value < size() KOKKOS_INLINE_FUNCTION size_type impl_acquire() const { +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto item = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); +#else auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); +#endif std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; @@ -122,11 +127,11 @@ class 
UniqueToken<SYCL, UniqueTokenScope::Instance> public: UniqueToken() : UniqueToken<SYCL, UniqueTokenScope::Global>( - Kokkos::Experimental::SYCL().concurrency()) {} + Kokkos::SYCL().concurrency()) {} explicit UniqueToken(execution_space const& arg) : UniqueToken<SYCL, UniqueTokenScope::Global>( - Kokkos::Experimental::SYCL().concurrency(), arg) {} + Kokkos::SYCL().concurrency(), arg) {} explicit UniqueToken(size_type max_size) : UniqueToken<SYCL, UniqueTokenScope::Global>(max_size) {} diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index c308384af090f1f395480ef49232f8987762cd07..abf0bd8f53e86b4737cf22f01a4874e927602b80 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -21,8 +21,53 @@ namespace Kokkos::Impl::SYCLReduction { -// FIXME_SYCL It appears that using shuffles is slower than going through local -// memory. 
+template <int N> +struct TrivialWrapper { + std::byte array[N]; +}; + +// shuffle down +template <typename T> +T shift_group_left(sycl::sub_group sg, T x, + sycl::sub_group::linear_id_type delta) { + if constexpr (std::is_trivially_copyable_v<T>) + return sycl::shift_group_left(sg, x, delta); + else { + auto tmp = sycl::shift_group_left( + sg, reinterpret_cast<TrivialWrapper<sizeof(T)>&>(x), delta); + return reinterpret_cast<T&>(tmp); + } +} + +// shuffle up +template <typename T> +T shift_group_right(sycl::sub_group sg, T x, + sycl::sub_group::linear_id_type delta) { + if constexpr (std::is_trivially_copyable_v<T>) + return sycl::shift_group_right(sg, x, delta); + else { + auto tmp = sycl::shift_group_right( + sg, reinterpret_cast<TrivialWrapper<sizeof(T)>&>(x), delta); + return reinterpret_cast<T&>(tmp); + } +} + +// shuffle +template <typename T> +T select_from_group(sycl::sub_group sg, T x, + sycl::sub_group::id_type remote_local_id) { + if constexpr (std::is_trivially_copyable_v<T>) + return sycl::select_from_group(sg, x, remote_local_id); + else { + auto tmp = sycl::select_from_group( + sg, reinterpret_cast<TrivialWrapper<sizeof(T)>&>(x), remote_local_id); + return reinterpret_cast<T&>(tmp); + } +} + +// FIXME_SYCL For some types, shuffle reductions are competitive with local +// memory reductions but they are significantly slower for the value type used +// in combined reductions with multiple double arguments. 
template <class ReducerType> inline constexpr bool use_shuffle_based_algorithm = false; // std::is_reference_v<typename ReducerType::reference_type>; @@ -30,7 +75,7 @@ inline constexpr bool use_shuffle_based_algorithm = false; template <typename ValueType, typename ReducerType, int dim> std::enable_if_t<!use_shuffle_based_algorithm<ReducerType>> workgroup_reduction( sycl::nd_item<dim>& item, sycl::local_accessor<ValueType> local_mem, - sycl::device_ptr<ValueType> results_ptr, + sycl_device_ptr<ValueType> results_ptr, sycl::global_ptr<ValueType> device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -102,24 +147,40 @@ std::enable_if_t<!use_shuffle_based_algorithm<ReducerType>> workgroup_reduction( template <typename ValueType, typename ReducerType, int dim> std::enable_if_t<use_shuffle_based_algorithm<ReducerType>> workgroup_reduction( sycl::nd_item<dim>& item, sycl::local_accessor<ValueType> local_mem, - ValueType local_value, sycl::device_ptr<ValueType> results_ptr, + ValueType local_value, sycl_device_ptr<ValueType> results_ptr, sycl::global_ptr<ValueType> device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); // Perform the actual workgroup reduction in each subgroup // separately. 
- auto sg = item.get_sub_group(); - const int id_in_sg = sg.get_local_id()[0]; - const auto local_range = - std::min<unsigned int>(sg.get_local_range()[0], max_size); + auto sg = item.get_sub_group(); + const int id_in_sg = sg.get_local_id()[0]; + const int local_range = std::min<int>(sg.get_local_range()[0], max_size); const auto upper_stride_bound = - std::min<unsigned int>(local_range - id_in_sg, max_size - local_id); + std::min<int>(local_range - id_in_sg, max_size - local_id); +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine = [&](int stride) { + if (stride < local_range) { + auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left(sg, local_value, + stride); + if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp); + } + }; + shuffle_combine(1); + shuffle_combine(2); + shuffle_combine(4); + shuffle_combine(8); + shuffle_combine(16); + KOKKOS_ASSERT(local_range <= 32); +#else for (unsigned int stride = 1; stride < local_range; stride <<= 1) { - auto tmp = sg.shuffle_down(local_value, stride); + auto tmp = + Kokkos::Impl::SYCLReduction::shift_group_left(sg, local_value, stride); if (stride < upper_stride_bound) final_reducer.join(&local_value, &tmp); } +#endif // Copy the subgroup results into the first positions of the // reduction array. @@ -140,7 +201,7 @@ std::enable_if_t<use_shuffle_based_algorithm<ReducerType>> workgroup_reduction( // the first subgroup, we first combine the items with a higher // index. if (n_active_subgroups > local_range) { - for (unsigned int offset = local_range; offset < n_active_subgroups; + for (int offset = local_range; offset < n_active_subgroups; offset += local_range) if (id_in_sg + offset < n_active_subgroups) { final_reducer.join(&sg_value, &local_mem[(id_in_sg + offset)]); @@ -149,11 +210,29 @@ std::enable_if_t<use_shuffle_based_algorithm<ReducerType>> workgroup_reduction( } // Then, we proceed as before. 
+#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + auto shuffle_combine_sg = [&](int stride) { + if (stride < local_range) { + auto tmp = + Kokkos::Impl::SYCLReduction::shift_group_left(sg, sg_value, stride); + if (id_in_sg + stride < n_active_subgroups) + final_reducer.join(&sg_value, &tmp); + } + }; + shuffle_combine_sg(1); + shuffle_combine_sg(2); + shuffle_combine_sg(4); + shuffle_combine_sg(8); + shuffle_combine_sg(16); + KOKKOS_ASSERT(local_range <= 32); +#else for (unsigned int stride = 1; stride < local_range; stride <<= 1) { - auto tmp = sg.shuffle_down(sg_value, stride); + auto tmp = + Kokkos::Impl::SYCLReduction::shift_group_left(sg, sg_value, stride); if (id_in_sg + stride < n_active_subgroups) final_reducer.join(&sg_value, &tmp); } +#endif // Finally, we copy the workgroup results back to global memory // to be used in the next iteration. If this is the last diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 9548f211d9e3fb090e2d10ca24ac77e5871df51e..2905733a4de985c8fb867b4c4e18db6455ead6ed 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -23,24 +23,16 @@ namespace Kokkos { namespace Impl { -template <class T, class... 
P> -struct ZeroMemset<Kokkos::Experimental::SYCL, View<T, P...>> { - ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View<T, P...>& dst, - typename View<T, P...>::const_value_type&) { - auto event = exec_space.impl_internal_space_instance()->m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View<T, P...>::value_type)); +template <> +struct ZeroMemset<Kokkos::SYCL> { + ZeroMemset(const Kokkos::SYCL& exec_space, void* dst, size_t cnt) { + auto event = + exec_space.impl_internal_space_instance()->m_queue->memset(dst, 0, cnt); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES exec_space.impl_internal_space_instance() ->m_queue->ext_oneapi_submit_barrier(std::vector<sycl::event>{event}); #endif } - - ZeroMemset(const View<T, P...>& dst, - typename View<T, P...>::const_value_type&) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View<T, P...>::value_type)); - } }; } // namespace Impl diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp index e81e8349391e8ef5b51e80bd162ad77e82eba61e..44d797f1cccc27a736153fb638eabcb56df32fbf 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp @@ -35,6 +35,9 @@ namespace Kokkos { namespace Impl { +std::vector<SerialInternal*> SerialInternal::all_instances; +std::mutex SerialInternal::all_instances_mutex; + bool SerialInternal::is_initialized() { return m_is_initialized; } void SerialInternal::initialize() { @@ -43,6 +46,12 @@ void SerialInternal::initialize() { Impl::SharedAllocationRecord<void, void>::tracking_enable(); m_is_initialized = true; + + // guard pushing to all_instances + { + std::scoped_lock lock(all_instances_mutex); + all_instances.push_back(this); + } } void SerialInternal::finalize() { @@ -58,9 +67,18 @@ void SerialInternal::finalize() { m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0); } - 
Kokkos::Profiling::finalize(); - m_is_initialized = false; + + // guard erasing from all_instances + { + std::scoped_lock lock(all_instances_mutex); + auto it = std::find(all_instances.begin(), all_instances.end(), this); + if (it == all_instances.end()) + Kokkos::abort( + "Execution space instance to be removed couldn't be found!"); + std::swap(*it, all_instances.back()); + all_instances.pop_back(); + } } SerialInternal& SerialInternal::singleton() { @@ -99,9 +117,12 @@ void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes, m_thread_team_data.disband_team(); m_thread_team_data.disband_pool(); - space.deallocate("Kokkos::Serial::scratch_mem", - m_thread_team_data.scratch_buffer(), - m_thread_team_data.scratch_bytes()); + // impl_deallocate doesn't fence which we try to avoid here since that + // interferes with the using the m_instance_mutex for ensuring proper + // kernel enqueuing + space.impl_deallocate("Kokkos::Serial::scratch_mem", + m_thread_team_data.scratch_buffer(), + m_thread_team_data.scratch_bytes()); } if (pool_reduce_bytes < old_pool_reduce) { @@ -121,13 +142,7 @@ void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes, HostThreadTeamData::scratch_size(pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, thread_local_bytes); - void* ptr = nullptr; - try { - ptr = space.allocate("Kokkos::Serial::scratch_mem", alloc_bytes); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - // For now, just rethrow the error message the existing way - Kokkos::Impl::throw_runtime_exception(failure.get_error_message()); - } + void* ptr = space.allocate("Kokkos::Serial::scratch_mem", alloc_bytes); m_thread_team_data.scratch_assign(static_cast<char*>(ptr), alloc_bytes, pool_reduce_bytes, team_reduce_bytes, @@ -149,13 +164,15 @@ Serial::Serial(NewInstance) : m_space_instance(new Impl::SerialInternal, [](Impl::SerialInternal* ptr) { ptr->finalize(); delete ptr; - }) {} + }) { + m_space_instance->initialize(); +} 
void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS os << "Kokkos atomics disabled\n"; #endif diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp index db1567610b23bcd4b7ced931321e9c5c46b4496b..a1fa9e43e083b00862ddbca639c0548218ce1630 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -30,10 +30,10 @@ static_assert(false, #include <cstddef> #include <iosfwd> +#include <iterator> #include <mutex> #include <thread> #include <Kokkos_Core_fwd.hpp> -#include <Kokkos_TaskScheduler.hpp> #include <Kokkos_Layout.hpp> #include <Kokkos_HostSpace.hpp> #include <Kokkos_ScratchSpace.hpp> @@ -59,7 +59,10 @@ class SerialInternal { static SerialInternal& singleton(); - std::mutex m_thread_team_data_mutex; + std::mutex m_instance_mutex; + + static std::vector<SerialInternal*> all_instances; + static std::mutex all_instances_mutex; // Resize thread team data scratch memory void resize_thread_team_data(size_t pool_reduce_bytes, @@ -112,7 +115,15 @@ class Serial { Serial(); - Serial(NewInstance); + explicit Serial(NewInstance); + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + template <typename T = void> + KOKKOS_DEPRECATED_WITH_COMMENT( + "Serial execution space should be constructed explicitly.") + Serial(NewInstance) + : Serial(NewInstance{}) {} +#endif /// \brief True if and only if this method is being called in a /// thread-parallel function. @@ -120,7 +131,10 @@ class Serial { /// For the Serial device, this method <i>always</i> returns false, /// because parallel_for or parallel_reduce with the Serial device /// always execute sequentially. 
- inline static int in_parallel() { return false; } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static int in_parallel() { return false; } +#endif /// \brief Wait until all dispatched functors complete. /// @@ -133,7 +147,14 @@ class Serial { name, Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, - []() {}); // TODO: correct device ID + []() { + std::lock_guard<std::mutex> lock_all_instances( + Impl::SerialInternal::all_instances_mutex); + for (auto* instance_ptr : Impl::SerialInternal::all_instances) { + std::lock_guard<std::mutex> lock_instance( + instance_ptr->m_instance_mutex); + } + }); // TODO: correct device ID Kokkos::memory_fence(); } @@ -141,7 +162,10 @@ class Serial { "Kokkos::Serial::fence: Unnamed Instance Fence") const { Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>( name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - []() {}); // TODO: correct device ID + [this]() { + auto* internal_instance = this->impl_internal_space_instance(); + std::lock_guard<std::mutex> lock(internal_instance->m_instance_mutex); + }); // TODO: correct device ID Kokkos::memory_fence(); } @@ -242,7 +266,7 @@ template <class T> std::vector<Serial> partition_space(const Serial&, std::vector<T> const& weights) { static_assert( - std::is_arithmetic<T>::value, + std::is_arithmetic_v<T>, "Kokkos Error: partitioning arguments must be integers or floats"); // We only care about the number of instances to create and ignore weights @@ -259,7 +283,9 @@ std::vector<Serial> partition_space(const Serial&, #include <Serial/Kokkos_Serial_Parallel_Range.hpp> #include <Serial/Kokkos_Serial_Parallel_MDRange.hpp> #include <Serial/Kokkos_Serial_Parallel_Team.hpp> +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include <Serial/Kokkos_Serial_Task.hpp> +#endif #include <Serial/Kokkos_Serial_UniqueToken.hpp> #endif // defined( KOKKOS_ENABLE_SERIAL ) diff --git 
a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 69787aa5001ab7104024a62976a51efe2d34e2ef..addcaba009fa2ce120cd576d1208b6b5ba68ef71 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_MDRANGE_HPP -#define KOKKO_SERIAL_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP #include <Kokkos_Parallel.hpp> #include <KokkosExp_MDRangePolicy.hpp> @@ -43,7 +43,19 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, } public: - inline void execute() const { this->exec(); } + inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + // Make sure kernels are running sequentially even when using multiple + // threads + auto* internal_instance = + m_iter.m_rp.space().impl_internal_space_instance(); + std::lock_guard<std::mutex> lock(internal_instance->m_instance_mutex); +#endif + this->exec(); + } template <typename Policy, typename Functor> static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -104,9 +116,16 @@ class ParallelReduce<CombinedFunctorReducerType, auto* internal_instance = m_iter.m_rp.space().impl_internal_space_instance(); - // Need to lock resize_thread_team_data - std::lock_guard<std::mutex> lock( - internal_instance->m_thread_team_data_mutex); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + 
std::lock_guard<std::mutex> instance_lock( + internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 56894716dbd7bc720da9fc640aa15f110ccd2455..2ab7b7f803486d3c55154277e0cb21797284f1e3 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_RANGE_HPP -#define KOKKO_SERIAL_PARALLEL_RANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_RANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_RANGE_HPP #include <Kokkos_Parallel.hpp> @@ -31,7 +31,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Serial> { const Policy m_policy; template <class TagType> - std::enable_if_t<std::is_void<TagType>::value> exec() const { + std::enable_if_t<std::is_void_v<TagType>> exec() const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { m_functor(i); @@ -39,7 +39,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Serial> { } template <class TagType> - std::enable_if_t<!std::is_void<TagType>::value> exec() const { + std::enable_if_t<!std::is_void_v<TagType>> exec() const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -49,6 +49,15 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Serial> { public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + // Make sure kernels are running sequentially 
even when using multiple + // threads + auto* internal_instance = m_policy.space().impl_internal_space_instance(); + std::lock_guard<std::mutex> lock(internal_instance->m_instance_mutex); +#endif this->template exec<typename Policy::work_tag>(); } @@ -75,7 +84,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, const pointer_type m_result_ptr; template <class TagType> - inline std::enable_if_t<std::is_void<TagType>::value> exec( + inline std::enable_if_t<std::is_void_v<TagType>> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -84,7 +93,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, } template <class TagType> - inline std::enable_if_t<!std::is_void<TagType>::value> exec( + inline std::enable_if_t<!std::is_void_v<TagType>> exec( reference_type update) const { const TagType t{}; @@ -103,9 +112,16 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); - // Need to lock resize_thread_team_data - std::lock_guard<std::mutex> lock( - internal_instance->m_thread_team_data_mutex); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard<std::mutex> instance_lock( + internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -160,7 +176,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, const Policy m_policy; template <class TagType> - inline 
std::enable_if_t<std::is_void<TagType>::value> exec( + inline std::enable_if_t<std::is_void_v<TagType>> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -169,7 +185,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, } template <class TagType> - inline std::enable_if_t<!std::is_void<TagType>::value> exec( + inline std::enable_if_t<!std::is_void_v<TagType>> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -187,10 +203,18 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks - // Need to lock resize_thread_team_data auto* internal_instance = m_policy.space().impl_internal_space_instance(); - std::lock_guard<std::mutex> lock( - internal_instance->m_thread_team_data_mutex); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard<std::mutex> instance_lock( + internal_instance->m_instance_mutex); +#endif + internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -227,7 +251,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, const pointer_type m_result_ptr; template <class TagType> - inline std::enable_if_t<std::is_void<TagType>::value> exec( + inline std::enable_if_t<std::is_void_v<TagType>> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -236,7 +260,7 @@ class 
ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, } template <class TagType> - inline std::enable_if_t<!std::is_void<TagType>::value> exec( + inline std::enable_if_t<!std::is_void_v<TagType>> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -253,10 +277,18 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks - // Need to lock resize_thread_team_data auto* internal_instance = m_policy.space().impl_internal_space_instance(); - std::lock_guard<std::mutex> lock( - internal_instance->m_thread_team_data_mutex); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard<std::mutex> instance_lock( + internal_instance->m_instance_mutex); +#endif + internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index 0876f1af229d764ca4bebf752f819b75282f1c15..7a6faf3d9fb565f622c761107acdb1aa0d9066da 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_TEAM_HPP -#define KOKKO_SERIAL_PARALLEL_TEAM_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_TEAM_HPP +#define KOKKOS_SERIAL_PARALLEL_TEAM_HPP #include <Kokkos_Parallel.hpp> @@ -37,6 +37,8 @@ class TeamPolicyInternal<Kokkos::Serial, Properties...> int m_league_size; int m_chunk_size; + Kokkos::Serial m_space; + public: //! 
Tag this class as a kokkos execution policy using execution_policy = TeamPolicyInternal; @@ -46,10 +48,7 @@ class TeamPolicyInternal<Kokkos::Serial, Properties...> //! Execution space of this execution policy: using execution_space = Kokkos::Serial; - const typename traits::execution_space& space() const { - static typename traits::execution_space m_space; - return m_space; - } + const typename traits::execution_space& space() const { return m_space; } template <class ExecSpace, class... OtherProperties> friend class TeamPolicyInternal; @@ -116,12 +115,13 @@ class TeamPolicyInternal<Kokkos::Serial, Properties...> return (level == 0 ? 1024 * 32 : 20 * 1024 * 1024); } /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space&, int league_size_request, + TeamPolicyInternal(const execution_space& space, int league_size_request, int team_size_request, int /* vector_length_request */ = 1) : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0}, m_league_size(league_size_request), - m_chunk_size(32) { + m_chunk_size(32), + m_space(space) { if (team_size_request > 1) Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); } @@ -223,7 +223,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, const size_t m_shared; template <class TagType> - inline std::enable_if_t<std::is_void<TagType>::value> exec( + inline std::enable_if_t<std::is_void_v<TagType>> exec( HostThreadTeamData& data) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor(Member(data, ileague, m_league)); @@ -231,7 +231,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, } template <class TagType> - inline std::enable_if_t<!std::is_void<TagType>::value> exec( + inline std::enable_if_t<!std::is_void_v<TagType>> exec( HostThreadTeamData& data) const { const TagType t{}; for (int ileague = 0; ileague < m_league; ++ileague) { @@ -247,9 +247,17 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, 
const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); - // Need to lock resize_thread_team_data - std::lock_guard<std::mutex> lock( - internal_instance->m_thread_team_data_mutex); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + // Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard<std::mutex> instance_lock( + internal_instance->m_instance_mutex); +#endif + internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -291,7 +299,7 @@ class ParallelReduce<CombinedFunctorReducerType, size_t m_shared; template <class TagType> - inline std::enable_if_t<std::is_void<TagType>::value> exec( + inline std::enable_if_t<std::is_void_v<TagType>> exec( HostThreadTeamData& data, reference_type update) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor_reducer.get_functor()(Member(data, ileague, m_league), update); @@ -299,7 +307,7 @@ class ParallelReduce<CombinedFunctorReducerType, } template <class TagType> - inline std::enable_if_t<!std::is_void<TagType>::value> exec( + inline std::enable_if_t<!std::is_void_v<TagType>> exec( HostThreadTeamData& data, reference_type update) const { const TagType t{}; @@ -319,9 +327,17 @@ class ParallelReduce<CombinedFunctorReducerType, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); - // Need to lock resize_thread_team_data - std::lock_guard<std::mutex> lock( - internal_instance->m_thread_team_data_mutex); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + 
// Make sure kernels are running sequentially even when using multiple + // threads, lock resize_thread_team_data + std::lock_guard<std::mutex> instance_lock( + internal_instance->m_instance_mutex); +#endif + internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp index f9c86f55ce05f92e074e37770efc3911830f5e87..678d18250474cc3bc7c1647c71ba09f707fee704 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp @@ -25,10 +25,16 @@ #include <Serial/Kokkos_Serial.hpp> #include <impl/Kokkos_HostThreadTeam.hpp> #include <impl/Kokkos_TaskQueue.hpp> +#include <impl/Kokkos_TaskTeamMember.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -102,9 +108,8 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Serial, QueueType>> { template <class Scheduler> class TaskQueueSpecializationConstrained< - Scheduler, - std::enable_if_t<std::is_same<typename Scheduler::execution_space, - Kokkos::Serial>::value>> { + Scheduler, std::enable_if_t<std::is_same_v< + typename Scheduler::execution_space, Kokkos::Serial>>> { public: // Note: Scheduler may be an incomplete type at class scope (but not inside // of the methods, obviously) @@ -121,7 +126,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast<task_base_type*>(task_base_type::EndTag); 
execution_space serial_execution_space; auto& data = serial_execution_space.impl_internal_space_instance() @@ -157,7 +162,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast<task_base_type*>(task_base_type::EndTag); execution_space serial_execution_space; @@ -215,6 +220,10 @@ extern template class TaskQueue<Kokkos::Serial, } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_WorkGraphPolicy.hpp index 7e5cd8d88a3da1af62f8c76eba05e3bcff57b837..57f70276325f80dea5cbe98e7c00e6ee394eae0b 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_WorkGraphPolicy.hpp @@ -30,13 +30,13 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, FunctorType m_functor; template <class TagType> - std::enable_if_t<std::is_void<TagType>::value> exec_one( + std::enable_if_t<std::is_void_v<TagType>> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template <class TagType> - std::enable_if_t<!std::is_void<TagType>::value> exec_one( + std::enable_if_t<!std::is_void_v<TagType>> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 3ec2dfbcfa0a83c20e9d02042c958890074a6c22..527e09407989c3ae374e04c674ae6a44acd020ff 100644 --- 
a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -22,6 +22,7 @@ #include <Serial/Kokkos_Serial.hpp> #include <type_traits> +#include <cstring> namespace Kokkos { namespace Impl { @@ -30,18 +31,11 @@ namespace Impl { // parallel execution space since the specialization for // DefaultHostExecutionSpace is defined elsewhere. struct DummyExecutionSpace; -template <class T, class... P> +template <> struct ZeroMemset< - std::conditional_t<!std::is_same<Serial, DefaultHostExecutionSpace>::value, - Serial, DummyExecutionSpace>, - View<T, P...>> - : public ZeroMemset<DefaultHostExecutionSpace, View<T, P...>> { - using Base = ZeroMemset<DefaultHostExecutionSpace, View<T, P...>>; - using Base::Base; - - ZeroMemset(const Serial&, const View<T, P...>& dst, - typename View<T, P...>::const_value_type& value) - : Base(dst, value) {} + std::conditional_t<!std::is_same_v<Serial, DefaultHostExecutionSpace>, + Serial, DummyExecutionSpace>> { + ZeroMemset(const Serial&, void* dst, size_t cnt) { std::memset(dst, 0, cnt); } }; } // namespace Impl diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp index c0d70c03ecbef0fb10d2757b0fa9fad66235d817..31653c46cac379922d9d2ed8ee52081927d189d7 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp @@ -38,15 +38,6 @@ static_assert(false, /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { -class ThreadsExec; -enum class fence_is_static { yes, no }; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /** \brief Execution space for a pool of C++11 threads on a CPU. 
*/ @@ -73,7 +64,9 @@ class Threads { /// \brief True if and only if this method is being called in a /// thread-parallel function. - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp similarity index 55% rename from packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index c754091e87e0423790c6b16f86658c8021c136a0..edc9489f67e798728c40d6d30593a0d4451754d0 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -16,17 +16,15 @@ #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE +#include "Threads/Kokkos_Threads_Instance.hpp" #endif #include <Kokkos_Macros.hpp> -#include <cstdint> -#include <limits> #include <utility> #include <iostream> #include <sstream> #include <thread> -#include <mutex> #include <Kokkos_Core.hpp> @@ -41,7 +39,6 @@ namespace Kokkos { namespace Impl { namespace { -std::mutex host_internal_cppthread_mutex; // std::thread compatible driver. // Recovery from an exception would require constant intra-thread health @@ -49,7 +46,7 @@ std::mutex host_internal_cppthread_mutex; // abort the process. 
void internal_cppthread_driver() { try { - ThreadsExec::driver(); + ThreadsInternal::driver(); } catch (const std::exception &x) { std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl; @@ -62,31 +59,17 @@ void internal_cppthread_driver() { } } -ThreadsExec s_threads_process; -ThreadsExec *s_threads_exec[ThreadsExec::MAX_THREAD_COUNT] = {nullptr}; -std::thread::id s_threads_pid[ThreadsExec::MAX_THREAD_COUNT]; -std::pair<unsigned, unsigned> s_threads_coord[ThreadsExec::MAX_THREAD_COUNT]; +ThreadsInternal s_threads_process; +ThreadsInternal *s_threads_exec[ThreadsInternal::MAX_THREAD_COUNT] = {nullptr}; +std::thread::id s_threads_pid[ThreadsInternal::MAX_THREAD_COUNT]; +std::pair<unsigned, unsigned> + s_threads_coord[ThreadsInternal::MAX_THREAD_COUNT]; int s_thread_pool_size[3] = {0, 0, 0}; -unsigned s_current_reduce_size = 0; -unsigned s_current_shared_size = 0; - -void (*volatile s_current_function)(ThreadsExec &, const void *); -const void *volatile s_current_function_arg = nullptr; - -struct Sentinel { - ~Sentinel() { - if (s_thread_pool_size[0] || s_thread_pool_size[1] || - s_thread_pool_size[2] || s_current_reduce_size || - s_current_shared_size || s_current_function || s_current_function_arg || - s_threads_exec[0]) { - std::cerr << "ERROR : Process exiting while Kokkos::Threads is still " - "initialized" - << std::endl; - } - } -}; +using s_current_function_type = void (*)(ThreadsInternal &, const void *); +std::atomic<s_current_function_type> s_current_function; +std::atomic<const void *> s_current_function_arg = nullptr; inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); @@ -97,6 +80,12 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } +void wait_yield(std::atomic<ThreadState> &flag, const ThreadState value) { + while (value == flag) { + std::this_thread::yield(); + } +} + } // namespace } // namespace Impl } // namespace Kokkos @@ 
-107,151 +96,116 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { namespace Kokkos { namespace Impl { -//---------------------------------------------------------------------------- -// Spawn a thread - -void ThreadsExec::spawn() { - std::thread t(internal_cppthread_driver); - t.detach(); -} - -//---------------------------------------------------------------------------- - -bool ThreadsExec::is_process() { +bool ThreadsInternal::is_process() { static const std::thread::id master_pid = std::this_thread::get_id(); return master_pid == std::this_thread::get_id(); } -void ThreadsExec::global_lock() { host_internal_cppthread_mutex.lock(); } - -void ThreadsExec::global_unlock() { host_internal_cppthread_mutex.unlock(); } - //---------------------------------------------------------------------------- -void ThreadsExec::wait_yield(volatile int &flag, const int value) { - while (value == flag) { - std::this_thread::yield(); - } -} - -void execute_function_noop(ThreadsExec &, const void *) {} +void execute_function_noop(ThreadsInternal &, const void *) {} -void ThreadsExec::driver() { +void ThreadsInternal::driver() { SharedAllocationRecord<void, void>::tracking_enable(); - ThreadsExec this_thread; + ThreadsInternal this_thread; - while (ThreadsExec::Active == this_thread.m_pool_state) { + while (this_thread.m_pool_state == ThreadState::Active) { (*s_current_function)(this_thread, s_current_function_arg); // Deactivate thread and wait for reactivation - this_thread.m_pool_state = ThreadsExec::Inactive; + this_thread.m_pool_state = ThreadState::Inactive; - wait_yield(this_thread.m_pool_state, ThreadsExec::Inactive); + wait_yield(this_thread.m_pool_state, ThreadState::Inactive); } } -ThreadsExec::ThreadsExec() +ThreadsInternal::ThreadsInternal() : m_pool_base(nullptr), m_scratch(nullptr), m_scratch_reduce_end(0), m_scratch_thread_end(0), - m_numa_rank(0), - m_numa_core_rank(0), m_pool_rank(0), m_pool_size(0), m_pool_fan_size(0), - 
m_pool_state(ThreadsExec::Terminating) { + m_pool_state(ThreadState::Terminating) { if (&s_threads_process != this) { - // A spawned thread - - ThreadsExec *const nil = nullptr; + // The code in the if is executed by a spawned thread not by the root + // thread + ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding - const int entry = reinterpret_cast<size_t>(s_current_function_arg) < - size_t(s_thread_pool_size[0]) - ? reinterpret_cast<size_t>(s_current_function_arg) - : size_t(Kokkos::hwloc::bind_this_thread( - s_thread_pool_size[0], s_threads_coord)); + const int entry = + reinterpret_cast<size_t>(s_current_function_arg.load()) < + size_t(s_thread_pool_size[0]) + ? reinterpret_cast<size_t>(s_current_function_arg.load()) + : size_t(Kokkos::hwloc::bind_this_thread(s_thread_pool_size[0], + s_threads_coord)); // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && nil == atomic_compare_exchange(s_threads_exec + entry, nil, this)) { - const std::pair<unsigned, unsigned> coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - m_numa_rank = coord.first; - m_numa_core_rank = coord.second; - m_pool_base = s_threads_exec; - m_pool_rank = s_thread_pool_size[0] - (entry + 1); - m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); - m_pool_size = s_thread_pool_size[0]; - m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); - m_pool_state = ThreadsExec::Active; + m_pool_base = s_threads_exec; + m_pool_rank = s_thread_pool_size[0] - (entry + 1); + m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); + m_pool_size = s_thread_pool_size[0]; + m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); + m_pool_state = ThreadState::Active; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); // Inform spawning process that the threads_exec entry has been set. 
- s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; } else { // Inform spawning process that the threads_exec entry could not be set. - s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } else { // Enables 'parallel_for' to execute on unitialized Threads device m_pool_rank = 0; m_pool_size = 1; - m_pool_state = ThreadsExec::Inactive; + m_pool_state = ThreadState::Inactive; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); } } -ThreadsExec::~ThreadsExec() { +ThreadsInternal::~ThreadsInternal() { const unsigned entry = m_pool_size - (m_pool_rank + 1); - using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; - if (m_scratch) { - Record *const r = Record::get_record(m_scratch); - + Kokkos::kokkos_free<Kokkos::HostSpace>(m_scratch); m_scratch = nullptr; - - Record::decrement(r); } m_pool_base = nullptr; m_scratch_reduce_end = 0; m_scratch_thread_end = 0; - m_numa_rank = 0; - m_numa_core_rank = 0; m_pool_rank = 0; m_pool_size = 0; m_pool_fan_size = 0; - m_pool_state = ThreadsExec::Terminating; + m_pool_state = ThreadState::Terminating; if (&s_threads_process != this && entry < MAX_THREAD_COUNT) { - ThreadsExec *const nil = nullptr; + ThreadsInternal *const nil = nullptr; atomic_compare_exchange(s_threads_exec + entry, this, nil); - s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } -int ThreadsExec::get_thread_count() { return s_thread_pool_size[0]; } - -ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { - ThreadsExec *const th = +ThreadsInternal *ThreadsInternal::get_thread(const int init_thread_rank) { + ThreadsInternal *const th = init_thread_rank < s_thread_pool_size[0] ? 
s_threads_exec[s_thread_pool_size[0] - (init_thread_rank + 1)] : nullptr; if (nullptr == th || th->m_pool_rank != init_thread_rank) { std::ostringstream msg; - msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + msg << "Kokkos::Impl::ThreadsInternal::get_thread ERROR : " << "thread " << init_thread_rank << " of " << s_thread_pool_size[0]; if (nullptr == th) { msg << " does not exist"; @@ -264,24 +218,6 @@ ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { return th; } -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { - ThreadsExec::global_lock(); - ThreadsExec::global_unlock(); - - const int n = exec.m_pool_fan_size; - const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); - - for (int i = 0; i < n; ++i) { - Impl::spinwait_while_equal<int>( - exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, - ThreadsExec::Active); - } - - exec.m_pool_state = ThreadsExec::Inactive; -} - } // namespace Impl } // namespace Kokkos @@ -290,8 +226,8 @@ void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { namespace Kokkos { namespace Impl { -void ThreadsExec::verify_is_process(const std::string &name, - const bool initialized) { +void ThreadsInternal::verify_is_process(const std::string &name, + const bool initialized) { if (!is_process()) { std::string msg(name); msg.append( @@ -307,63 +243,48 @@ void ThreadsExec::verify_is_process(const std::string &name, } } -int ThreadsExec::in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED int ThreadsInternal::in_parallel() { // A thread function is in execution and // the function argument is not the special threads process argument and // the master process is a worker or is not the master process. 
return s_current_function && (&s_threads_process != s_current_function_arg) && (s_threads_process.m_pool_base || !is_process()); } -void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); } -void ThreadsExec::fence(const std::string &name) { - internal_fence(name, Impl::fence_is_static::yes); +#endif +void ThreadsInternal::fence() { + fence("Kokkos::ThreadsInternal::fence: Unnamed Instance Fence"); } - -void ThreadsExec::internal_fence(Impl::fence_is_static is_static) { - internal_fence((is_static == Impl::fence_is_static::no) - ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence" - : "Kokkos::ThreadsExec::fence: Unnamed Static Fence", - is_static); +void ThreadsInternal::fence(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + internal_fence); } // Wait for root thread to become inactive -void ThreadsExec::internal_fence(const std::string &name, - Impl::fence_is_static is_static) { - const auto &fence_lam = [&]() { - if (s_thread_pool_size[0]) { - // Wait for the root thread to complete: - Impl::spinwait_while_equal<int>(s_threads_exec[0]->m_pool_state, - ThreadsExec::Active); - } +void ThreadsInternal::internal_fence() { + if (s_thread_pool_size[0]) { + // Wait for the root thread to complete: + Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, + ThreadState::Active); + } - s_current_function = nullptr; - s_current_function_arg = nullptr; + s_current_function = nullptr; + s_current_function_arg = nullptr; - // Make sure function and arguments are cleared before - // potentially re-activating threads with a subsequent launch. 
- memory_fence(); - }; - if (is_static == Impl::fence_is_static::yes) { - Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - fence_lam); - } else { - Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - fence_lam); - } + // Make sure function and arguments are cleared before + // potentially re-activating threads with a subsequent launch. + memory_fence(); } /** \brief Begin execution of the asynchronous functor */ -void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), - const void *arg) { - verify_is_process("ThreadsExec::start", true); +void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), + const void *arg) { + verify_is_process("ThreadsInternal::start", true); if (s_current_function || s_current_function_arg) { Kokkos::Impl::throw_runtime_exception( - std::string("ThreadsExec::start() FAILED : already executing")); + std::string("ThreadsInternal::start() FAILED : already executing")); } s_current_function = func; @@ -372,68 +293,29 @@ void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), // Make sure function and arguments are written before activating threads. memory_fence(); - // Activate threads: + // Activate threads. The spawned threads will start working on + // s_current_function. The root thread is only set to active, we still need to + // call s_current_function. 
for (int i = s_thread_pool_size[0]; 0 < i--;) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Active; + s_threads_exec[i]->m_pool_state = ThreadState::Active; } if (s_threads_process.m_pool_size) { // Master process is the root thread, run it: (*func)(s_threads_process, arg); - s_threads_process.m_pool_state = ThreadsExec::Inactive; - } -} - -//---------------------------------------------------------------------------- - -bool ThreadsExec::sleep() { - verify_is_process("ThreadsExec::sleep", true); - - if (&execute_sleep == s_current_function) return false; - - fence(); - - ThreadsExec::global_lock(); - - s_current_function = &execute_sleep; - - // Activate threads: - for (unsigned i = s_thread_pool_size[0]; 0 < i;) { - s_threads_exec[--i]->m_pool_state = ThreadsExec::Active; - } - - return true; -} - -bool ThreadsExec::wake() { - verify_is_process("ThreadsExec::wake", true); - - if (&execute_sleep != s_current_function) return false; - - ThreadsExec::global_unlock(); - - if (s_threads_process.m_pool_base) { - execute_sleep(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } - - fence(); - - return true; } //---------------------------------------------------------------------------- -void ThreadsExec::execute_resize_scratch_in_serial() { +void ThreadsInternal::execute_resize_scratch_in_serial() { const unsigned begin = s_threads_process.m_pool_base ? 
1 : 0; - auto deallocate_scratch_memory = [](ThreadsExec &exec) { + auto deallocate_scratch_memory = [](ThreadsInternal &exec) { if (exec.m_scratch) { - using Record = - Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; - Record *const r = Record::get_record(exec.m_scratch); - exec.m_scratch = nullptr; - Record::decrement(r); + Kokkos::kokkos_free<Kokkos::HostSpace>(exec.m_scratch); + exec.m_scratch = nullptr; } }; if (s_threads_process.m_pool_base) { @@ -449,18 +331,18 @@ void ThreadsExec::execute_resize_scratch_in_serial() { memory_fence(); for (unsigned i = s_thread_pool_size[0]; begin < i;) { - ThreadsExec &th = *s_threads_exec[--i]; + ThreadsInternal &th = *s_threads_exec[--i]; - th.m_pool_state = ThreadsExec::Active; + th.m_pool_state = ThreadState::Active; - wait_yield(th.m_pool_state, ThreadsExec::Active); + wait_yield(th.m_pool_state, ThreadState::Active); } if (s_threads_process.m_pool_base) { deallocate_scratch_memory(s_threads_process); - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; first_touch_allocate_thread_private_scratch(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_current_function_arg = nullptr; @@ -472,27 +354,20 @@ void ThreadsExec::execute_resize_scratch_in_serial() { //---------------------------------------------------------------------------- -void *ThreadsExec::root_reduce_scratch() { +void *ThreadsInternal::root_reduce_scratch() { return s_threads_process.reduce_memory(); } -void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, - const void *) { +void ThreadsInternal::first_touch_allocate_thread_private_scratch( + ThreadsInternal &exec, const void *) { exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end; if (s_threads_process.m_scratch_thread_end) { // 
Allocate tracked memory: { - using Record = - Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; - Record *const r = - Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch", - s_threads_process.m_scratch_thread_end); - - Record::increment(r); - - exec.m_scratch = r->data(); + exec.m_scratch = Kokkos::kokkos_malloc<Kokkos::HostSpace>( + "Kokkos::thread_scratch", s_threads_process.m_scratch_thread_end); } unsigned *ptr = reinterpret_cast<unsigned *>(exec.m_scratch); @@ -505,7 +380,7 @@ void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, } } -void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { +void *ThreadsInternal::resize_scratch(size_t reduce_size, size_t thread_size) { enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; fence(); @@ -522,7 +397,7 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { if ((old_reduce_size < reduce_size) || (old_thread_size < thread_size) || ((reduce_size == 0 && thread_size == 0) && (old_reduce_size != 0 || old_thread_size != 0))) { - verify_is_process("ThreadsExec::resize_scratch", true); + verify_is_process("ThreadsInternal::resize_scratch", true); s_threads_process.m_scratch_reduce_end = reduce_size; s_threads_process.m_scratch_thread_end = reduce_size + thread_size; @@ -537,27 +412,22 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { //---------------------------------------------------------------------------- -void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { - verify_is_process("ThreadsExec::print_configuration", false); +void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { + verify_is_process("ThreadsInternal::print_configuration", false); fence(); - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = - 
Kokkos::hwloc::get_available_threads_per_core(); - - // Forestall compiler warnings for unused variables. - (void)numa_count; - (void)cores_per_numa; - (void)threads_per_core; - s << "Kokkos::Threads"; #if defined(KOKKOS_ENABLE_THREADS) s << " KOKKOS_ENABLE_THREADS"; #endif #if defined(KOKKOS_ENABLE_HWLOC) + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = + Kokkos::hwloc::get_available_threads_per_core(); + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]"; #endif @@ -569,25 +439,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { if (nullptr == s_threads_process.m_pool_base) { s << " Asynchronous"; } - s << " ReduceScratch[" << s_current_reduce_size << "]" - << " SharedScratch[" << s_current_shared_size << "]"; s << std::endl; if (detail) { for (int i = 0; i < s_thread_pool_size[0]; ++i) { - ThreadsExec *const th = s_threads_exec[i]; + ThreadsInternal *const th = s_threads_exec[i]; if (th) { const int rank_rev = th->m_pool_size - (th->m_pool_rank + 1); - s << " Thread[ " << th->m_pool_rank << " : " << th->m_numa_rank << "." - << th->m_numa_core_rank << " ]"; + s << " Thread[ " << th->m_pool_rank << " ]"; s << " Fan{"; for (int j = 0; j < th->m_pool_fan_size; ++j) { - ThreadsExec *const thfan = th->m_pool_base[rank_rev + (1 << j)]; - s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank - << "." 
<< thfan->m_numa_core_rank << " ]"; + ThreadsInternal *const thfan = th->m_pool_base[rank_rev + (1 << j)]; + s << " [ " << thfan->m_pool_rank << " ]"; } s << " }"; @@ -605,29 +471,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { //---------------------------------------------------------------------------- -int ThreadsExec::is_initialized() { return nullptr != s_threads_exec[0]; } +int ThreadsInternal::is_initialized() { return nullptr != s_threads_exec[0]; } -void ThreadsExec::initialize(int thread_count_arg) { - // legacy arguments - unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; - unsigned use_numa_count = 0; - unsigned use_cores_per_numa = 0; - bool allow_asynchronous_threadpool = false; - // need to provide an initializer for Intel compilers - static const Sentinel sentinel = {}; +void ThreadsInternal::initialize(int thread_count_arg) { + unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; const bool is_initialized = 0 != s_thread_pool_size[0]; unsigned thread_spawn_failed = 0; - for (int i = 0; i < ThreadsExec::MAX_THREAD_COUNT; i++) + for (int i = 0; i < ThreadsInternal::MAX_THREAD_COUNT; i++) s_threads_exec[i] = nullptr; if (!is_initialized) { - // If thread_count, use_numa_count, or use_cores_per_numa are zero - // then they will be given default values based upon hwloc detection - // and allowed asynchronous execution. - + // If thread_count is zero then it will be given default values based upon + // hwloc detection. 
const bool hwloc_avail = Kokkos::hwloc::available(); const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads(); @@ -640,17 +498,18 @@ void ThreadsExec::initialize(int thread_count_arg) { : 1; } - const unsigned thread_spawn_begin = hwloc::thread_mapping( - "Kokkos::Threads::initialize", allow_asynchronous_threadpool, - thread_count, use_numa_count, use_cores_per_numa, s_threads_coord); + const bool allow_asynchronous_threadpool = false; + unsigned use_numa_count = 0; + unsigned use_cores_per_numa = 0; + hwloc::thread_mapping("Kokkos::Threads::initialize", + allow_asynchronous_threadpool, thread_count, + use_numa_count, use_cores_per_numa, s_threads_coord); const std::pair<unsigned, unsigned> proc_coord = s_threads_coord[0]; - if (thread_spawn_begin) { - // Synchronous with s_threads_coord[0] as the process core - // Claim entry #0 for binding the process core. - s_threads_coord[0] = std::pair<unsigned, unsigned>(~0u, ~0u); - } + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. + s_threads_coord[0] = std::pair<unsigned, unsigned>(~0u, ~0u); s_thread_pool_size[0] = thread_count; s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count; @@ -658,8 +517,8 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = &execute_function_noop; // Initialization work function - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { - s_threads_process.m_pool_state = ThreadsExec::Inactive; + for (unsigned ith = 1; ith < thread_count; ++ith) { + s_threads_process.m_pool_state = ThreadState::Inactive; // If hwloc available then spawned thread will // choose its own entry in 's_threads_coord' @@ -675,18 +534,20 @@ void ThreadsExec::initialize(int thread_count_arg) { // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successful then // an entry in 's_threads_exec' will be assigned. 
- ThreadsExec::spawn(); - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); - if (s_threads_process.m_pool_state == ThreadsExec::Terminating) break; + std::thread t(internal_cppthread_driver); + t.detach(); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); + if (s_threads_process.m_pool_state == ThreadState::Terminating) break; } // Wait for all spawned threads to deactivate before zeroing the function. - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. - ThreadsExec *const th = ((ThreadsExec * volatile *)s_threads_exec)[ith]; + ThreadsInternal *const th = + ((ThreadsInternal *volatile *)s_threads_exec)[ith]; if (th) { - wait_yield(th->m_pool_state, ThreadsExec::Active); + wait_yield(th->m_pool_state, ThreadState::Active); } else { ++thread_spawn_failed; } @@ -694,7 +555,7 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = nullptr; s_current_function_arg = nullptr; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; memory_fence(); @@ -705,30 +566,17 @@ void ThreadsExec::initialize(int thread_count_arg) { Kokkos::hwloc::bind_this_thread(proc_coord); } - if (thread_spawn_begin) { // Include process in pool. 
- const std::pair<unsigned, unsigned> coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - s_threads_exec[0] = &s_threads_process; - s_threads_process.m_numa_rank = coord.first; - s_threads_process.m_numa_core_rank = coord.second; - s_threads_process.m_pool_base = s_threads_exec; - s_threads_process.m_pool_rank = - thread_count - 1; // Reversed for scan-compatible reductions - s_threads_process.m_pool_size = thread_count; - s_threads_process.m_pool_fan_size = fan_size( - s_threads_process.m_pool_rank, s_threads_process.m_pool_size); - s_threads_pid[s_threads_process.m_pool_rank] = - std::this_thread::get_id(); - } else { - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 0; - s_threads_process.m_pool_fan_size = 0; - } + s_threads_exec[0] = &s_threads_process; + s_threads_process.m_pool_base = s_threads_exec; + s_threads_process.m_pool_rank = + thread_count - 1; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count; + s_threads_process.m_pool_fan_size = fan_size( + s_threads_process.m_pool_rank, s_threads_process.m_pool_size); + s_threads_pid[s_threads_process.m_pool_rank] = std::this_thread::get_id(); // Initial allocations: - ThreadsExec::resize_scratch(1024, 1024); + ThreadsInternal::resize_scratch(1024, 1024); } else { s_thread_pool_size[0] = 0; s_thread_pool_size[1] = 0; @@ -773,8 +621,8 @@ void ThreadsExec::initialize(int thread_count_arg) { //---------------------------------------------------------------------------- -void ThreadsExec::finalize() { - verify_is_process("ThreadsExec::finalize", false); +void ThreadsInternal::finalize() { + verify_is_process("ThreadsInternal::finalize", false); fence(); @@ -784,18 +632,18 @@ void ThreadsExec::finalize() { for (unsigned i = s_thread_pool_size[0]; begin < i--;) { if (s_threads_exec[i]) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating; + s_threads_exec[i]->m_pool_state = ThreadState::Terminating; 
- wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_threads_pid[i] = std::thread::id(); } if (s_threads_process.m_pool_base) { - (&s_threads_process)->~ThreadsExec(); + (&s_threads_process)->~ThreadsInternal(); s_threads_exec[0] = nullptr; } @@ -808,15 +656,11 @@ void ThreadsExec::finalize() { s_thread_pool_size[2] = 0; // Reset master thread to run solo. - s_threads_process.m_numa_rank = 0; - s_threads_process.m_numa_core_rank = 0; - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 1; - s_threads_process.m_pool_fan_size = 0; - s_threads_process.m_pool_state = ThreadsExec::Inactive; - - Kokkos::Profiling::finalize(); + s_threads_process.m_pool_base = nullptr; + s_threads_process.m_pool_rank = 0; + s_threads_process.m_pool_size = 1; + s_threads_process.m_pool_fan_size = 0; + s_threads_process.m_pool_state = ThreadState::Inactive; } //---------------------------------------------------------------------------- @@ -836,7 +680,7 @@ int Threads::concurrency() const { return impl_thread_pool_size(0); } #endif void Threads::fence(const std::string &name) const { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no); + Impl::ThreadsInternal::fence(name); } Threads &Threads::impl_instance(int) { diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp similarity index 76% rename from packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index 377e096bfbebc6df30ca024c82aef2275e0c68a4..130b3433d02687c3e817f8504bf1f851d1b4daf2 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ 
b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_THREADSEXEC_HPP -#define KOKKOS_THREADSEXEC_HPP +#ifndef KOKKOS_THREADS_INSTANCE_HPP +#define KOKKOS_THREADS_INSTANCE_HPP #include <Kokkos_Macros.hpp> @@ -23,41 +23,25 @@ #include <ostream> #include <utility> -#include <impl/Kokkos_Spinwait.hpp> - #include <Kokkos_Atomic.hpp> #include <Kokkos_Pair.hpp> #include <impl/Kokkos_ConcurrentBitset.hpp> #include <Threads/Kokkos_Threads.hpp> +#include <Threads/Kokkos_Threads_Spinwait.hpp> +#include <Threads/Kokkos_Threads_State.hpp> //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -class ThreadsExec { +class ThreadsInternal { public: // Fan array has log_2(NT) reduction threads plus 2 scan threads // Currently limited to 16k threads. - enum { MAX_FAN_COUNT = 16 }; - enum { MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2) }; - enum { VECTOR_LENGTH = 8 }; - - /** \brief States of a worker thread */ - enum { - Terminating ///< Termination in progress - , - Inactive ///< Exists, waiting for work - , - Active ///< Exists, performing work - , - Rendezvous ///< Exists, waiting in a barrier or reduce - - , - ScanCompleted, - ScanAvailable, - ReductionAvailable - }; + static constexpr int MAX_FAN_COUNT = 16; + static constexpr int MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2); + static constexpr int VECTOR_LENGTH = 8; private: friend class Kokkos::Threads; @@ -67,18 +51,16 @@ class ThreadsExec { // the threads that need them. // For a simple reduction the thread location is arbitrary. 
- ThreadsExec *const *m_pool_base; ///< Base for pool fan-in + ThreadsInternal *const *m_pool_base; ///< Base for pool fan-in void *m_scratch; int m_scratch_reduce_end; size_t m_scratch_thread_end; - int m_numa_rank; - int m_numa_core_rank; int m_pool_rank; int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - int volatile m_pool_state; ///< State for global synchronizations + std::atomic<ThreadState> m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -93,41 +75,36 @@ class ThreadsExec { static void global_lock(); static void global_unlock(); - static void spawn(); - static void first_touch_allocate_thread_private_scratch(ThreadsExec &, + static void first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); - static void execute_sleep(ThreadsExec &, const void *); - ThreadsExec(const ThreadsExec &); - ThreadsExec &operator=(const ThreadsExec &); + ThreadsInternal(const ThreadsInternal &); + ThreadsInternal &operator=(const ThreadsInternal &); static void execute_resize_scratch_in_serial(); public: KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size; } KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank; } - KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank; } - KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank; } inline long team_work_index() const { return m_team_work_index; } - static int get_thread_count(); - static ThreadsExec *get_thread(const int init_thread_rank); + static ThreadsInternal *get_thread(const int init_thread_rank); inline void *reduce_memory() const { return m_scratch; } KOKKOS_INLINE_FUNCTION void *scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION int volatile &state() { return m_pool_state; } - KOKKOS_INLINE_FUNCTION ThreadsExec *const *pool_base() const { + KOKKOS_INLINE_FUNCTION auto 
&state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } static void driver(void); - ~ThreadsExec(); - ThreadsExec(); + ~ThreadsInternal(); + ThreadsInternal(); static void *resize_scratch(size_t reduce_size, size_t thread_size); @@ -143,15 +120,8 @@ class ThreadsExec { static void finalize(); - /* Given a requested team size, return valid team size */ - static unsigned team_size_valid(unsigned); - static void print_configuration(std::ostream &, const bool detail = false); - //------------------------------------ - - static void wait_yield(volatile int &, const int); - //------------------------------------ // All-thread functions: @@ -166,14 +136,14 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal<int>( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast @@ -191,7 +161,7 @@ class ThreadsExec { memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } @@ -207,21 +177,21 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal<int>( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + 
ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } } @@ -234,9 +204,9 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; - Impl::spinwait_while_equal<int>(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join( reinterpret_cast<typename FunctorType::value_type *>(reduce_memory()), @@ -255,7 +225,7 @@ class ThreadsExec { // to inactive triggers another thread to exit a spinwait // and read the 'reduce_memory'. // Must 'memory_fence()' to guarantee that storing the update to - // 'reduce_memory()' will complete before storing the the update to + // 'reduce_memory()' will complete before storing the update to // 'm_pool_state'. 
memory_fence(); @@ -265,8 +235,8 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal<int>( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } } @@ -289,10 +259,10 @@ class ThreadsExec { //-------------------------------- // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: Active -> ReductionAvailable (or ScanAvailable) - Impl::spinwait_while_equal<int>(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join(work_value, fan.reduce_memory()); } @@ -303,39 +273,37 @@ class ThreadsExec { if (rev_rank) { // Set: Active -> ReductionAvailable - m_pool_state = ThreadsExec::ReductionAvailable; + m_pool_state = ThreadState::ReductionAvailable; // Wait for contributing threads' scan value to be available. 
if ((1 << m_pool_fan_size) < (m_pool_rank + 1)) { - ThreadsExec &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; + ThreadsInternal &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; // Wait: Active -> ReductionAvailable // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal<int>(th.m_pool_state, ThreadsExec::Active); - Impl::spinwait_while_equal<int>(th.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(th.m_pool_state, ThreadState::Active); + spinwait_while_equal(th.m_pool_state, ThreadState::ReductionAvailable); f.join(work_value + count, ((scalar_type *)th.reduce_memory()) + count); } // This thread has completed inclusive scan // Set: ReductionAvailable -> ScanAvailable - m_pool_state = ThreadsExec::ScanAvailable; + m_pool_state = ThreadState::ScanAvailable; // Wait for all threads to complete inclusive scan // Wait: ScanAvailable -> Rendezvous - Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::ScanAvailable); + spinwait_while_equal(m_pool_state, ThreadState::ScanAvailable); } //-------------------------------- for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal<int>(fan.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(fan.m_pool_state, ThreadState::ReductionAvailable); // Set: ScanAvailable -> Rendezvous - fan.m_pool_state = ThreadsExec::Rendezvous; + fan.m_pool_state = ThreadState::Rendezvous; } // All threads have completed the inclusive scan. 
@@ -346,7 +314,7 @@ class ThreadsExec { if ((rev_rank + 1) < m_pool_size) { // Exclusive scan: copy the previous thread's inclusive scan value - ThreadsExec &th = *m_pool_base[rev_rank + 1]; // Not the root thread + ThreadsInternal &th = *m_pool_base[rev_rank + 1]; // Not the root thread const scalar_type *const src_value = ((scalar_type *)th.reduce_memory()) + count; @@ -362,19 +330,18 @@ class ThreadsExec { // Wait for all threads to copy previous thread's inclusive scan value // Wait for all threads: Rendezvous -> ScanCompleted for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal<int>( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Rendezvous); } if (rev_rank) { // Set: ScanAvailable -> ScanCompleted - m_pool_state = ThreadsExec::ScanCompleted; + m_pool_state = ThreadState::ScanCompleted; // Wait: ScanCompleted -> Active - Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::ScanCompleted); + spinwait_while_equal(m_pool_state, ThreadState::ScanCompleted); } // Set: ScanCompleted -> Active for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -391,8 +358,8 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal<int>( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } for (unsigned i = 0; i < count; ++i) { @@ -400,9 +367,9 @@ class ThreadsExec { } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal<int>(m_pool_state, 
ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the thread-scan before releasing threads @@ -424,7 +391,7 @@ class ThreadsExec { } for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -433,18 +400,14 @@ class ThreadsExec { * complete and release the Threads device. * Acquire the Threads device and start this functor. */ - static void start(void (*)(ThreadsExec &, const void *), const void *); + static void start(void (*)(ThreadsInternal &, const void *), const void *); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int in_parallel(); +#endif static void fence(); static void fence(const std::string &); - static void internal_fence( - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static void internal_fence( - const std::string &, - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static bool sleep(); - static bool wake(); + static void internal_fence(); /* Dynamic Scheduling related functionality */ // Initialize the work range for this thread @@ -583,30 +546,38 @@ class ThreadsExec { namespace Kokkos { -inline int Threads::in_parallel() { return Impl::ThreadsExec::in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline int Threads::in_parallel() { + return Impl::ThreadsInternal::in_parallel(); +} +#endif inline int Threads::impl_is_initialized() { - return Impl::ThreadsExec::is_initialized(); + return Impl::ThreadsInternal::is_initialized(); } inline void Threads::impl_initialize(InitializationSettings const &settings) { - Impl::ThreadsExec::initialize( + Impl::ThreadsInternal::initialize( settings.has_num_threads() ? 
settings.get_num_threads() : -1); } -inline void Threads::impl_finalize() { Impl::ThreadsExec::finalize(); } +inline void Threads::impl_finalize() { Impl::ThreadsInternal::finalize(); } inline void Threads::print_configuration(std::ostream &os, bool verbose) const { os << "Host Parallel Execution Space:\n"; os << " KOKKOS_ENABLE_THREADS: yes\n"; os << "\nThreads Runtime Configuration:\n"; - Impl::ThreadsExec::print_configuration(os, verbose); + Impl::ThreadsInternal::print_configuration(os, verbose); } inline void Threads::impl_static_fence(const std::string &name) { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes); + Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + Impl::ThreadsInternal::internal_fence); } } /* namespace Kokkos */ -#endif /* #define KOKKOS_THREADSEXEC_HPP */ +#endif diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 0828f262993cc13527d7ee900f7b3db1fbcc68d9..711b1b69261f9e3dbc8710c1f21be7a8976694a5 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -46,54 +46,54 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule<typename Policy::schedule_type::type>(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule<typename Policy::schedule_type::type>(instance, arg); } template <class Schedule> - static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + static std::enable_if_t<std::is_same_v<Schedule, Kokkos::Static>> + exec_schedule(ThreadsInternal &instance, const void *arg) { const 
ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); self.exec_range(range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template <class Schedule> - static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + static std::enable_if_t<std::is_same_v<Schedule, Kokkos::Dynamic>> + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = static_cast<Member>(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; self.exec_range(begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 3698416ef187dfcf6649b57ce8ac3d297fad1d05..25aab9ebfbc157077566254cf1d1860675d0f59e 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -35,7 +35,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, const Policy m_policy; template <class TagType> - inline static std::enable_if_t<std::is_void<TagType>::value> exec_range( + inline static std::enable_if_t<std::is_void_v<TagType>> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -47,7 +47,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, } template <class TagType> - inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range( + inline static std::enable_if_t<!std::is_void_v<TagType>> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { const TagType t{}; #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -59,37 +59,37 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule<typename Policy::schedule_type::type>(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule<typename 
Policy::schedule_type::type>(instance, arg); } template <class Schedule> - static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + static std::enable_if_t<std::is_same_v<Schedule, Kokkos::Static>> + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); ParallelFor::template exec_range<WorkTag>(self.m_functor, range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template <class Schedule> - static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + static std::enable_if_t<std::is_same_v<Schedule, Kokkos::Dynamic>> + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = @@ -100,16 +100,16 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, ? 
begin + self.m_policy.chunk_size() : self.m_policy.end(); ParallelFor::template exec_range<WorkTag>(self.m_functor, begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index 36404857a228beaff5208b9b5051be0b579a1dc0..40be3884c3d4a726f5829c5debf58a52333a495a 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -36,8 +36,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, const size_t m_shared; template <class TagType, class Schedule> - inline static std::enable_if_t<std::is_void<TagType>::value && - std::is_same<Schedule, Kokkos::Static>::value> + inline static std::enable_if_t<std::is_void_v<TagType> && + std::is_same_v<Schedule, Kokkos::Static>> exec_team(const FunctorType &functor, Member member) { for (; member.valid_static(); member.next_static()) { functor(member); @@ -45,8 +45,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, } template <class TagType, class Schedule> - inline static std::enable_if_t<!std::is_void<TagType>::value && - std::is_same<Schedule, Kokkos::Static>::value> + inline static std::enable_if_t<!std::is_void_v<TagType> && + std::is_same_v<Schedule, Kokkos::Static>> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -55,8 +55,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, } template <class TagType, class 
Schedule> - inline static std::enable_if_t<std::is_void<TagType>::value && - std::is_same<Schedule, Kokkos::Dynamic>::value> + inline static std::enable_if_t<std::is_void_v<TagType> && + std::is_same_v<Schedule, Kokkos::Dynamic>> exec_team(const FunctorType &functor, Member member) { for (; member.valid_dynamic(); member.next_dynamic()) { functor(member); @@ -64,8 +64,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, } template <class TagType, class Schedule> - inline static std::enable_if_t<!std::is_void<TagType>::value && - std::is_same<Schedule, Kokkos::Dynamic>::value> + inline static std::enable_if_t<!std::is_void_v<TagType> && + std::is_same_v<Schedule, Kokkos::Dynamic>> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_dynamic(); member.next_dynamic()) { @@ -73,14 +73,14 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); ParallelFor::exec_team<WorkTag, typename Policy::schedule_type::type>( - self.m_functor, Member(&exec, self.m_policy, self.m_shared)); + self.m_functor, Member(&instance, self.m_policy, self.m_shared)); - exec.barrier(); - exec.fan_in(); + instance.barrier(); + instance.fan_in(); } template <typename Policy> Policy fix_policy(Policy policy) { @@ -88,20 +88,24 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, policy.impl_set_vector_length(1); } if (policy.team_size() < 0) { - policy.impl_set_team_size( - policy.team_size_recommended(m_functor, ParallelForTag{})); + int team_size = policy.team_size_recommended(m_functor, ParallelForTag{}); + if (team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor<Threads, TeamPolicy> could not find a " + "valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } 
public: inline void execute() const { - ThreadsExec::resize_scratch( + ThreadsInternal::resize_scratch( 0, Policy::member_type::team_reduce_size() + m_shared); - ThreadsExec::start(&ParallelFor::exec, this); + ThreadsInternal::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index 3d06379480f72686dd5773789762fe23fc74d519..9f28f9bbfcc2dc6fa65456775bc1fc7a942d76f8 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -54,67 +54,67 @@ class ParallelReduce<CombinedFunctorReducerType, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule<typename Policy::schedule_type::type>(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule<typename Policy::schedule_type::type>(instance, arg); } template <class Schedule> - static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + static std::enable_if_t<std::is_same_v<Schedule, Kokkos::Static>> + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); self.exec_range( range.begin(), range.end(), - reducer.init(static_cast<pointer_type>(exec.reduce_memory()))); + reducer.init(static_cast<pointer_type>(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } 
template <class Schedule> - static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + static std::enable_if_t<std::is_same_v<Schedule, Kokkos::Dynamic>> + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); reference_type update = - self.m_reducer.init(static_cast<pointer_type>(exec.reduce_memory())); + reducer.init(static_cast<pointer_type>(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast<Member>(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; self.exec_range(begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(self.m_reducer); + instance.fan_in_reduce(reducer); } public: inline void execute() const { const ReducerType &reducer = m_iter.m_func.get_reducer(); - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp index 5fa97b403c4e767321bd2605649c3e9ec2636a48..4d108771b3c74fa56e9ee4629aea80f98651d9eb 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp @@ -42,7 +42,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, const pointer_type m_result_ptr; template <class TagType> - inline static std::enable_if_t<std::is_void<TagType>::value> exec_range( + inline static std::enable_if_t<std::is_void_v<TagType>> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -55,7 +55,7 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, } template <class TagType> - inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range( + inline static std::enable_if_t<!std::is_void_v<TagType>> exec_range( const FunctorType &functor, const Member 
&ibeg, const Member &iend, reference_type update) { const TagType t{}; @@ -68,42 +68,44 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule<typename Policy::schedule_type::type>(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule<typename Policy::schedule_type::type>(instance, arg); } template <class Schedule> - static std::enable_if_t<std::is_same<Schedule, Kokkos::Static>::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + static std::enable_if_t<std::is_same_v<Schedule, Kokkos::Static>> + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); ParallelReduce::template exec_range<WorkTag>( self.m_functor_reducer.get_functor(), range.begin(), range.end(), - reducer.init(static_cast<pointer_type>(exec.reduce_memory()))); + reducer.init(static_cast<pointer_type>(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template <class Schedule> - static std::enable_if_t<std::is_same<Schedule, Kokkos::Dynamic>::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + static std::enable_if_t<std::is_same_v<Schedule, Kokkos::Dynamic>> + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - 
exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); reference_type update = - reducer.init(static_cast<pointer_type>(exec.reduce_memory())); + reducer.init(static_cast<pointer_type>(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size() + @@ -114,10 +116,10 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, : self.m_policy.end(); ParallelReduce::template exec_range<WorkTag>( self.m_functor_reducer.get_functor(), begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } public: @@ -130,15 +132,15 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>, reducer.final(m_result_ptr); } } else { - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index c4b6100a9df2ac5d9ac9bfb799ce84489c1e577b..69527ee3e65e7044228ced482711a121626de909 100644 --- 
a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -42,7 +42,7 @@ class ParallelReduce<CombinedFunctorReducerType, const size_t m_shared; template <class TagType> - inline static std::enable_if_t<std::is_void<TagType>::value> exec_team( + inline static std::enable_if_t<std::is_void_v<TagType>> exec_team( const FunctorType &functor, Member member, reference_type update) { for (; member.valid_static(); member.next_static()) { functor(member, update); @@ -50,7 +50,7 @@ class ParallelReduce<CombinedFunctorReducerType, } template <class TagType> - inline static std::enable_if_t<!std::is_void<TagType>::value> exec_team( + inline static std::enable_if_t<!std::is_void_v<TagType>> exec_team( const FunctorType &functor, Member member, reference_type update) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -58,16 +58,16 @@ class ParallelReduce<CombinedFunctorReducerType, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); ParallelReduce::template exec_team<WorkTag>( self.m_functor_reducer.get_functor(), - Member(&exec, self.m_policy, self.m_shared), + Member(&instance, self.m_policy, self.m_shared), self.m_functor_reducer.get_reducer().init( - static_cast<pointer_type>(exec.reduce_memory()))); + static_cast<pointer_type>(instance.reduce_memory()))); - exec.fan_in_reduce(self.m_functor_reducer.get_reducer()); + instance.fan_in_reduce(self.m_functor_reducer.get_reducer()); } public: @@ -80,17 +80,17 @@ class ParallelReduce<CombinedFunctorReducerType, reducer.final(m_result_ptr); } } else { - ThreadsExec::resize_scratch( + ThreadsInternal::resize_scratch( reducer.value_size(), Policy::member_type::team_reduce_size() + m_shared); - ThreadsExec::start(&ParallelReduce::exec, this); + 
ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { @@ -106,9 +106,14 @@ class ParallelReduce<CombinedFunctorReducerType, policy.impl_set_vector_length(1); } if (policy.team_size() < 0) { - policy.impl_set_team_size(policy.team_size_recommended( + int team_size = policy.team_size_recommended( m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), - ParallelReduceTag{})); + ParallelReduceTag{}); + if (team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce<Threads, TeamPolicy> could not find " + "a valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp index 74d8561a34b7addf740a8d02d03f7f04a12aca25..d54f4ca952e62e55726064295d80328efaf9da7f 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp @@ -39,7 +39,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, const Policy m_policy; template <class TagType> - inline static std::enable_if_t<std::is_void<TagType>::value> exec_range( + inline static std::enable_if_t<std::is_void_v<TagType>> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -52,7 +52,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, } template <class TagType> - inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range( + inline static 
std::enable_if_t<!std::is_void_v<TagType>> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; @@ -65,33 +65,33 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScan &self = *((const ParallelScan *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast<pointer_type>(exec.reduce_memory())); + final_reducer.init(static_cast<pointer_type>(instance.reduce_memory())); ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large( final_reducer ); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScan::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScan::exec, this); + ThreadsInternal::fence(); } ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) @@ -119,7 +119,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, const pointer_type m_result_ptr; template <class TagType> - inline static std::enable_if_t<std::is_void<TagType>::value> exec_range( + inline static std::enable_if_t<std::is_void_v<TagType>> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type 
update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -132,7 +132,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, } template <class TagType> - inline static std::enable_if_t<!std::is_void<TagType>::value> exec_range( + inline static std::enable_if_t<!std::is_void_v<TagType>> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; @@ -145,37 +145,37 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast<pointer_type>(exec.reduce_memory())); + final_reducer.init(static_cast<pointer_type>(instance.reduce_memory())); ParallelScanWithTotal::template exec_range<WorkTag>( self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large(final_reducer); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScanWithTotal::template exec_range<WorkTag>( self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); - if (exec.pool_rank() == exec.pool_size() - 1) { + if (instance.pool_rank() == instance.pool_size() - 1) { *self.m_result_ptr = update; } } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScanWithTotal::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + 
ThreadsInternal::start(&ParallelScanWithTotal::exec, this); + ThreadsInternal::fence(); } template <class ViewType> diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp similarity index 90% rename from packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp index 0a7eda29bcf1b8862e8b97166865d995662ee893..0f9a77f2afa9ab2a2d5d4db70b32a2016b29793e 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -21,7 +21,7 @@ #include <Kokkos_Macros.hpp> #include <Kokkos_Atomic.hpp> -#include <impl/Kokkos_Spinwait.hpp> +#include <Threads/Kokkos_Threads_Spinwait.hpp> #include <impl/Kokkos_BitOps.hpp> #include <thread> @@ -108,5 +108,15 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } +void spinwait_while_equal(std::atomic<ThreadState> const& flag, + ThreadState const value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value == flag) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7ab43cdb7af6feee1e758b1c66d9dbb9a0ac60ae --- /dev/null +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -0,0 +1,44 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_THREADS_SPINWAIT_HPP +#define KOKKOS_THREADS_SPINWAIT_HPP + +#include <Threads/Kokkos_Threads_State.hpp> + +#include <cstdint> +#include <atomic> + +namespace Kokkos { +namespace Impl { + +enum class WaitMode : int { + ACTIVE // Used for tight loops to keep threads active longest + , + PASSIVE // Used to quickly yield the thread to quite down the system + , + ROOT // Never sleep or yield the root thread +}; + +void host_thread_yield(const uint32_t i, const WaitMode mode); + +void spinwait_while_equal(std::atomic<ThreadState> const& flag, + ThreadState const value); + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_State.hpp similarity index 59% rename from packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_State.hpp index 21ba7fad01cf0be4e2b1857a15860a8b3e4be74b..148e9aa4e05790551e5fa3c0d671d99c48068bbc 100644 --- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_State.hpp @@ -14,16 +14,26 @@ // //@HEADER -#ifndef KOKKOS_HBWSPACE_FWD_HPP_ -#define KOKKOS_HBWSPACE_FWD_HPP_ +#ifndef KOKKOS_THREADS_STATE_HPP +#define KOKKOS_THREADS_STATE_HPP -#ifdef KOKKOS_ENABLE_HBWSPACE namespace Kokkos { - -namespace Experimental { -class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. 
for KNL - /// processor) -} // namespace Experimental +namespace Impl { +/** \brief States of a worker thread */ +enum class ThreadState { + Terminating ///< Termination in progress + , + Inactive ///< Exists, waiting for work + , + Active ///< Exists, performing work + , + Rendezvous ///< Exists, waiting in a barrier or reduce + , + ScanCompleted, + ScanAvailable, + ReductionAvailable +}; +} // namespace Impl } // namespace Kokkos -#endif + #endif diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp similarity index 84% rename from packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index b1cadc7c485d65b16cc5cfc61d99a50ffecfb67e..f627e0d47a51e06a4e2d60e154b370fbdf4405f5 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -22,10 +22,11 @@ #include <cstdio> #include <utility> -#include <impl/Kokkos_Spinwait.hpp> #include <impl/Kokkos_HostThreadTeam.hpp> #include <Kokkos_Atomic.hpp> +#include <Threads/Kokkos_Threads_Spinwait.hpp> +#include <Threads/Kokkos_Threads_State.hpp> //---------------------------------------------------------------------------- @@ -50,8 +51,8 @@ class ThreadsExecTeamMember { private: using space = execution_space::scratch_memory_space; - ThreadsExec* const m_exec; - ThreadsExec* const* m_team_base; ///< Base for team fan-in + ThreadsInternal* const m_instance; + ThreadsInternal* const* m_team_base; ///< Base for team fan-in space m_team_shared; size_t m_team_shared_size; int m_team_size; @@ -84,14 +85,13 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - Impl::spinwait_while_equal<int>(m_team_base[j]->state(), - ThreadsExec::Active); + spinwait_while_equal(m_team_base[j]->state(), ThreadState::Active); } // If not root then wait for release 
if (m_team_rank_rev) { - m_exec->state() = ThreadsExec::Rendezvous; - Impl::spinwait_while_equal<int>(m_exec->state(), ThreadsExec::Rendezvous); + m_instance->state() = ThreadState::Rendezvous; + spinwait_while_equal(m_instance->state(), ThreadState::Rendezvous); } return !m_team_rank_rev; @@ -102,7 +102,7 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - m_team_base[j]->state() = ThreadsExec::Active; + m_team_base[j]->state() = ThreadState::Active; } } @@ -143,8 +143,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c<sizeof(ValueType) < TEAM_REDUCE_SIZE, - ValueType, void>::type; + using type = std::conditional_t<sizeof(ValueType) < TEAM_REDUCE_SIZE, + ValueType, void>; if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); @@ -164,8 +164,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c<sizeof(ValueType) < TEAM_REDUCE_SIZE, - ValueType, void>::type; + using type = std::conditional_t<sizeof(ValueType) < TEAM_REDUCE_SIZE, + ValueType, void>; f(value); if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); memory_fence(); @@ -186,12 +186,10 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: using type = - typename if_c<sizeof(Type) < TEAM_REDUCE_SIZE, Type, void>::type; - - if (nullptr == m_exec) return value; + std::conditional_t<sizeof(Type) < TEAM_REDUCE_SIZE, Type, void>; if (team_rank() != team_size() - 1) * - ((volatile type*)m_exec->scratch_memory()) = value; + ((volatile type*)m_instance->scratch_memory()) = value; memory_fence(); @@ -217,54 +215,65 @@ class ThreadsExecTeamMember { } template <typename ReducerType> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<Kokkos::is_reducer<ReducerType>::value> - team_reduce(const 
ReducerType& reducer, - const typename ReducerType::value_type contribution) const { + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> + team_reduce(const ReducerType& reducer, + typename ReducerType::value_type& contribution) const { KOKKOS_IF_ON_DEVICE(((void)reducer; (void)contribution;)) - KOKKOS_IF_ON_HOST(( - using value_type = typename ReducerType::value_type; - // Make sure there is enough scratch space: - using type = typename if_c<sizeof(value_type) < TEAM_REDUCE_SIZE, - value_type, void>::type; - - if (nullptr == m_exec) return; + KOKKOS_IF_ON_HOST( + (using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy<Threads>, + ReducerType, value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), contribution); + reducer.reference() = contribution;)) + } - type* const local_value = ((type*)m_exec->scratch_memory()); + template <typename WrappedReducerType> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<WrappedReducerType>::value> + impl_team_reduce( + const WrappedReducerType& wrapped_reducer, + typename WrappedReducerType::value_type& contribution) const { + using value_type = typename WrappedReducerType::value_type; + // Make sure there is enough scratch space: + using type = std::conditional_t<sizeof(value_type) < TEAM_REDUCE_SIZE, + value_type, void>; + + type* const local_value = ((type*)m_instance->scratch_memory()); + + // Set this thread's contribution + if (team_rank() != team_size() - 1) { + *local_value = contribution; + } - // Set this thread's contribution - if (team_rank() != team_size() - 1) { *local_value = contribution; } + // Fence to make sure the base team member has access: + memory_fence(); - // Fence to make sure the base team member has access: - memory_fence(); + if (team_fan_in()) { + // The last thread to synchronize returns true, all other threads + // wait for team_fan_out() + 
type* const team_value = ((type*)m_team_base[0]->scratch_memory()); - if (team_fan_in()) { - // The last thread to synchronize returns true, all other threads - // wait for team_fan_out() - type* const team_value = ((type*)m_team_base[0]->scratch_memory()); - - *team_value = contribution; - // Join to the team value: - for (int i = 1; i < m_team_size; ++i) { - reducer.join(*team_value, - *((type*)m_team_base[i]->scratch_memory())); - } + *team_value = contribution; + // Join to the team value: + for (int i = 1; i < m_team_size; ++i) { + wrapped_reducer.join(team_value, + ((type*)m_team_base[i]->scratch_memory())); + } - // Team base thread may "lap" member threads so copy out to their - // local value. - for (int i = 1; i < m_team_size; ++i) { - *((type*)m_team_base[i]->scratch_memory()) = *team_value; - } + // Team base thread may "lap" member threads so copy out to their + // local value. + for (int i = 1; i < m_team_size; ++i) { + *((type*)m_team_base[i]->scratch_memory()) = *team_value; + } - // Fence to make sure all team members have access - memory_fence(); - } + // Fence to make sure all team members have access + memory_fence(); + } - team_fan_out(); + team_fan_out(); - // Value was changed by the team base - reducer.reference() = *local_value;)) + contribution = *local_value; } /** \brief Intra-team exclusive prefix sum with team_rank() ordering @@ -282,12 +291,10 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_DEVICE(((void)global_accum; return value;)) KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c<sizeof(ArgType) < TEAM_REDUCE_SIZE, ArgType, - void>::type; - - if (nullptr == m_exec) return type(0); + using type = std::conditional_t<sizeof(ArgType) < TEAM_REDUCE_SIZE, + ArgType, void>; - volatile type* const work_value = ((type*)m_exec->scratch_memory()); + volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -342,10 +349,10 @@ class ThreadsExecTeamMember { template 
<class... Properties> ThreadsExecTeamMember( - Impl::ThreadsExec* exec, + Impl::ThreadsInternal* instance, const TeamPolicyInternal<Kokkos::Threads, Properties...>& team, const size_t shared_size) - : m_exec(exec), + : m_instance(instance), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(shared_size), @@ -358,12 +365,15 @@ class ThreadsExecTeamMember { m_chunk_size(team.chunk_size()), m_league_chunk_end(0), m_team_alloc(team.team_alloc()) { + KOKKOS_ASSERT(m_instance != nullptr); if (team.league_size()) { // Execution is using device-team interface: - const int pool_rank_rev = m_exec->pool_size() - (m_exec->pool_rank() + 1); + const int pool_rank_rev = + m_instance->pool_size() - (m_instance->pool_rank() + 1); const int team_rank_rev = pool_rank_rev % team.team_alloc(); - const size_t pool_league_size = m_exec->pool_size() / team.team_alloc(); + const size_t pool_league_size = + m_instance->pool_size() / team.team_alloc(); const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc(); if (pool_league_rank_rev >= pool_league_size) { m_invalid_thread = 1; @@ -372,7 +382,7 @@ class ThreadsExecTeamMember { const size_t pool_league_rank = pool_league_size - (pool_league_rank_rev + 1); - const int pool_num_teams = m_exec->pool_size() / team.team_alloc(); + const int pool_num_teams = m_instance->pool_size() / team.team_alloc(); const int chunk_size = team.chunk_size() > 0 ? team.chunk_size() : team.team_iter(); const int chunks_per_team = @@ -387,8 +397,8 @@ class ThreadsExecTeamMember { if ((team.team_alloc() > size_t(m_team_size)) ? 
(team_rank_rev >= m_team_size) - : (m_exec->pool_size() - pool_num_teams * m_team_size > - m_exec->pool_rank())) + : (m_instance->pool_size() - pool_num_teams * m_team_size > + m_instance->pool_rank())) m_invalid_thread = 1; else m_invalid_thread = 0; @@ -398,7 +408,7 @@ class ThreadsExecTeamMember { if (team_rank_rev < team.team_size() && !m_invalid_thread) { m_team_base = - m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev; + m_instance->pool_base() + team.team_alloc() * pool_league_rank_rev; m_team_size = team.team_size(); m_team_rank = team.team_size() - (team_rank_rev + 1); m_team_rank_rev = team_rank_rev; @@ -413,13 +423,13 @@ class ThreadsExecTeamMember { } if ((m_team_rank_rev == 0) && (m_invalid_thread == 0)) { - m_exec->set_work_range(m_league_rank, m_league_end, m_chunk_size); - m_exec->reset_steal_target(m_team_size); + m_instance->set_work_range(m_league_rank, m_league_end, m_chunk_size); + m_instance->reset_steal_target(m_team_size); } if (std::is_same<typename TeamPolicyInternal< Kokkos::Threads, Properties...>::schedule_type::type, Kokkos::Dynamic>::value) { - m_exec->barrier(); + m_instance->barrier(); } } else { m_invalid_thread = 1; @@ -427,7 +437,7 @@ class ThreadsExecTeamMember { } ThreadsExecTeamMember() - : m_exec(nullptr), + : m_instance(nullptr), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(0), @@ -442,8 +452,8 @@ class ThreadsExecTeamMember { m_invalid_thread(0), m_team_alloc(0) {} - inline ThreadsExec& threads_exec_team_base() const { - return m_team_base ? **m_team_base : *m_exec; + inline ThreadsInternal& threads_exec_team_base() const { + return m_team_base ? 
**m_team_base : *m_instance; } bool valid_static() const { return m_league_rank < m_league_end; } @@ -890,19 +900,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType intermediate; - Sum<ValueType> sum(intermediate); - sum.init(intermediate); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::ThreadsExecTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - ValueType tmp = ValueType(); - lambda(i, tmp); - intermediate += tmp; + lambda(i, value); } - loop_boundaries.thread.team_reduce(sum, intermediate); - result = sum.reference(); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } template <typename iType, class Lambda, typename ReducerType> @@ -910,15 +926,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::ThreadsExecTeamMember::execution_space>, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; 
+ + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { lambda(i, value); } - loop_boundaries.thread.team_reduce(reducer, value); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } } // namespace Kokkos @@ -953,11 +979,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::ThreadsExecTeamMember::execution_space>, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } template <typename iType, class Lambda, typename ReducerType> @@ -965,11 +1004,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<typename Impl::ThreadsExecTeamMember::execution_space>, + ReducerType, value_type>; + using 
wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel exclusive prefix sum. Executes @@ -999,8 +1051,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } + auto& team_member = loop_bounds.thread; + // 'scan_val' output is the exclusive prefix sum - scan_val = loop_bounds.thread.team_scan(scan_val); + scan_val = team_member.team_scan(scan_val); #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -1010,6 +1064,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, true); } + team_member.team_broadcast(scan_val, team_member.team_size() - 1); + return_val = scan_val; } @@ -1048,7 +1104,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN, TeamPolicy<Threads>, FunctorType, void>::value_type; - static_assert(std::is_same<closure_value_type, ValueType>::value, + static_assert(std::is_same_v<closure_value_type, ValueType>, "Non-matching value types of closure and return type"); ValueType scan_val = ValueType(); diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index d4ce697548fab07735f363c9026d1f2d8164eb3e..5fed92db26de6c1b33462fe9a2469c48c01a229e 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP #include <Kokkos_Core_fwd.hpp> -#include <Threads/Kokkos_ThreadsExec.hpp> +#include <Threads/Kokkos_Threads_Instance.hpp> namespace Kokkos { namespace Impl { 
@@ -36,13 +36,13 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, FunctorType m_functor; template <class TagType> - std::enable_if_t<std::is_void<TagType>::value> exec_one( + std::enable_if_t<std::is_void_v<TagType>> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template <class TagType> - std::enable_if_t<!std::is_void<TagType>::value> exec_one( + std::enable_if_t<!std::is_void_v<TagType>> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); @@ -61,16 +61,17 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, } } - static inline void thread_main(ThreadsExec& exec, const void* arg) noexcept { + static inline void thread_main(ThreadsInternal& instance, + const void* arg) noexcept { const Self& self = *(static_cast<const Self*>(arg)); self.exec_one_thread(); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() { - ThreadsExec::start(&Self::thread_main, this); - ThreadsExec::fence(); + ThreadsInternal::start(&Self::thread_main, this); + ThreadsInternal::fence(); } inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/packages/kokkos/core/src/View/Kokkos_BasicView.hpp b/packages/kokkos/core/src/View/Kokkos_BasicView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..29eafca62eef00c474bfe9a7bc7d905439524ce8 --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_BasicView.hpp @@ -0,0 +1,652 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_BASIC_VIEW_HPP +#define KOKKOS_BASIC_VIEW_HPP +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Utilities.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> +#include <View/Kokkos_ViewAlloc.hpp> +#include <View/Kokkos_ViewCtor.hpp> +#include <View/Kokkos_ViewTraits.hpp> +#include <View/MDSpan/Kokkos_MDSpan_Header.hpp> +#include <View/MDSpan/Kokkos_MDSpan_Accessor.hpp> +#include <View/MDSpan/Kokkos_MDSpan_Layout.hpp> + +#include <optional> +#include <type_traits> + +// FIXME: we need to make this work for not using our mdspan impl +#define KOKKOS_IMPL_NO_UNIQUE_ADDRESS _MDSPAN_NO_UNIQUE_ADDRESS +namespace Kokkos::Impl { + +constexpr inline struct SubViewCtorTag { + explicit SubViewCtorTag() = default; +} subview_ctor_tag{}; + +template <class T> +struct KokkosSliceToMDSpanSliceImpl { + using type = T; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(const T &s) { return s; } +}; + +template <> +struct KokkosSliceToMDSpanSliceImpl<Kokkos::ALL_t> { + using type = full_extent_t; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(Kokkos::ALL_t) { + return full_extent; + } +}; + +template <class T> +using kokkos_slice_to_mdspan_slice = + typename KokkosSliceToMDSpanSliceImpl<T>::type; + +template <class T> +KOKKOS_INLINE_FUNCTION constexpr decltype(auto) +transform_kokkos_slice_to_mdspan_slice(const T &s) { + return KokkosSliceToMDSpanSliceImpl<T>::transform(s); +} + +// We do have implementation detail versions of these in our mdspan impl +// However they are not part of the public standard interface +template <class T> +struct is_layout_right_padded : public std::false_type {}; + +template <size_t Pad> +struct is_layout_right_padded<Kokkos::Experimental::layout_right_padded<Pad>> + : public std::true_type {}; + +template <class 
T> +struct is_layout_left_padded : public std::false_type {}; + +template <size_t Pad> +struct is_layout_left_padded<Kokkos::Experimental::layout_left_padded<Pad>> + : public std::true_type {}; + +template <class ElementType, class Extents, class LayoutPolicy, + class AccessorPolicy> +class BasicView { + public: + using mdspan_type = + mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy>; + using extents_type = typename mdspan_type::extents_type; + using layout_type = typename mdspan_type::layout_type; + using accessor_type = typename mdspan_type::accessor_type; + using mapping_type = typename mdspan_type::mapping_type; + using element_type = typename mdspan_type::element_type; + using value_type = typename mdspan_type::value_type; + using index_type = typename mdspan_type::index_type; + using size_type = typename mdspan_type::size_type; + using rank_type = typename mdspan_type::rank_type; + using data_handle_type = typename mdspan_type::data_handle_type; + using reference = typename mdspan_type::reference; + using memory_space = typename accessor_type::memory_space; + using execution_space = typename memory_space::execution_space; + + // For now View and BasicView will have a restriction that the data handle + // needs to be convertible to element_type* and vice versa + static_assert(std::is_constructible_v<element_type *, data_handle_type>); + static_assert(std::is_constructible_v<data_handle_type, element_type *>); + + KOKKOS_FUNCTION static constexpr rank_type rank() noexcept { + return extents_type::rank(); + } + KOKKOS_FUNCTION static constexpr rank_type rank_dynamic() noexcept { + return extents_type::rank_dynamic(); + } + KOKKOS_FUNCTION static constexpr size_t static_extent(rank_type r) noexcept { + return extents_type::static_extent(r); + } + KOKKOS_FUNCTION constexpr index_type extent(rank_type r) const noexcept { + return m_map.extents().extent(r); + }; + + protected: + // These are pre-condition checks which are unconditionally (i.e. 
in release + // mode) enabled in Kokkos::View 4.4 + template <class OtherMapping> + KOKKOS_FUNCTION static constexpr void check_basic_view_constructibility( + [[maybe_unused]] const OtherMapping &rhs) { + using src_t = typename OtherMapping::layout_type; + using dst_t = layout_type; + constexpr size_t rnk = mdspan_type::rank(); + if constexpr (!std::is_same_v<src_t, dst_t>) { + if constexpr (Impl::is_layout_left_padded<dst_t>::value) { + if constexpr (std::is_same_v<src_t, layout_stride>) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == 0 ? rhs.stride(1) : rhs.extents().extent(r)); + } + } + } + if constexpr (Impl::is_layout_right_padded<dst_t>::value) { + if constexpr (std::is_same_v<src_t, layout_stride>) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == rnk ? 
rhs.stride(r - 2) + : rhs.extents().extent(r - 1)); + } + } + } + } + if constexpr (std::is_same_v<dst_t, layout_left>) { + if constexpr (std::is_same_v<src_t, layout_stride>) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r); + } + } else if constexpr (Impl::is_layout_left_padded<src_t>::value && + rnk > 1) { + if (rhs.stride(1) != rhs.extents().extent(0)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + if constexpr (std::is_same_v<dst_t, layout_right>) { + if constexpr (std::is_same_v<src_t, layout_stride>) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r - 1); + } + } + } else if constexpr (Impl::is_layout_right_padded<src_t>::value && + rnk > 1) { + if (rhs.stride(rnk - 2) != rhs.extents().extent(rnk - 1)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + } + } + + public: + KOKKOS_DEFAULTED_FUNCTION constexpr BasicView() = default; + + KOKKOS_FUNCTION constexpr BasicView(const mdspan_type &other) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()){}; + KOKKOS_FUNCTION constexpr BasicView(mdspan_type &&other) + : m_ptr(std::move(other.data_handle())), + m_map(std::move(other.mapping())), + m_acc(std::move(other.accessor())){}; + + template <class... OtherIndexTypes> + // requires(std::is_constructible_v<mdspan_type, data_handle_type, + // OtherIndexTypes...>) + KOKKOS_FUNCTION explicit constexpr BasicView( + std::enable_if_t<std::is_constructible_v<mdspan_type, data_handle_type, + OtherIndexTypes...>, + data_handle_type> + p, + OtherIndexTypes... 
exts) + : m_ptr(std::move(p)), + m_map(extents_type(static_cast<index_type>(std::move(exts))...)), + m_acc{} {} + + template <class OtherIndexType, size_t Size> + // When doing C++20 we should switch to this, the conditional explicit we + // can't do in 17 + // requires(std::is_constructible_v<mdspan_type, data_handle_type, + // std::array<OtherIndexType, Size>>) + // explicit(Size != rank_dynamic()) + KOKKOS_FUNCTION constexpr BasicView( + std::enable_if_t< + std::is_constructible_v<mdspan_type, data_handle_type, + std::array<OtherIndexType, Size>>, + data_handle_type> + p, + const Array<OtherIndexType, Size> &exts) + : m_ptr(std::move(p)), m_map(extents_type(exts)), m_acc{} {} + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, + const extents_type &exts) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v<accessor_type> && + std::is_constructible_v<mapping_type, const extents_type &>) +#endif + : m_ptr(std::move(p)), m_map(exts), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v<accessor_type>) +#endif + : m_ptr(std::move(p)), m_map(m), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m, + const accessor_type &a) + : m_ptr(std::move(p)), m_map(m), m_acc(a) {} + + template <class OtherT, class OtherE, class OtherL, class OtherA> +// requires(std::is_constructible_v<mdspan_type, +// typename BasicView<OtherT, OtherE, +// OtherL, +// OtherA>::mdspan_type>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v<const typename OtherL::template mapping<OtherE> &, + mapping_type> || + !std::is_convertible_v<const OtherA &, accessor_type>) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const BasicView<OtherT, 
OtherE, OtherL, OtherA> &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, typename BasicView<OtherT, OtherE, OtherL, + OtherA>::mdspan_type>, + void *> = nullptr) + : m_ptr(other.m_ptr), m_map(other.m_map), m_acc(other.m_acc) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v<data_handle_type, + const typename OtherA::data_handle_type &>, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v<extents_type, OtherE>, + "Kokkos::View: incompatible extents for View construction"); + } + + template <class OtherT, class OtherE, class OtherL, class OtherA> +// requires(std::is_constructible_v<mdspan_type, +// mdspan<OtherT, OtherE, OtherL, OtherA>>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v<const typename OtherL::template mapping<OtherE> &, + mapping_type> || + !std::is_convertible_v<const OtherA &, accessor_type>) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const mdspan<OtherT, OtherE, OtherL, OtherA> &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, mdspan<OtherT, OtherE, OtherL, OtherA>>, + void *> = nullptr) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v<data_handle_type, + const typename OtherA::data_handle_type &>, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v<extents_type, OtherE>, + "Kokkos::View: incompatible extents for View construction"); + } + + // Allocating constructors specific to BasicView + /// + /// Construct from a given mapping + /// + explicit constexpr BasicView(const std::string &label, + const mapping_type &mapping) + : BasicView(view_alloc(label), mapping) 
{} + + /// + /// Construct from a given extents + /// + explicit constexpr BasicView(const std::string &label, + const extents_type &ext) + : BasicView(view_alloc(label), mapping_type{ext}) {} + + private: + template <class... P> + data_handle_type create_data_handle( + const Impl::ViewCtorProp<P...> &arg_prop, + const typename mdspan_type::mapping_type &arg_mapping) { + constexpr bool has_exec = Impl::ViewCtorProp<P...>::has_execution_space; + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, memory_space{}, execution_space{}); + using alloc_prop = decltype(prop_copy); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + return data_handle_type(Impl::make_shared_allocation_record<ElementType>( + arg_mapping.required_span_size(), + Impl::get_property<Impl::LabelTag>(prop_copy), + Impl::get_property<Impl::MemorySpaceTag>(prop_copy), + has_exec ? std::optional<execution_space>{Impl::get_property< + Impl::ExecutionSpaceTag>(prop_copy)} + : std::optional<execution_space>{std::nullopt}, + std::integral_constant<bool, alloc_prop::initialize>(), + std::integral_constant<bool, alloc_prop::sequential_host_init>())); + } + + public: + template <class... P> + // requires(!Impl::ViewCtorProp<P...>::has_pointer) + explicit inline BasicView( + const Impl::ViewCtorProp<P...> &arg_prop, + std::enable_if_t<!Impl::ViewCtorProp<P...>::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView(create_data_handle(arg_prop, arg_mapping), arg_mapping) {} + + template <class... 
P> + // requires(Impl::ViewCtorProp<P...>::has_pointer) + KOKKOS_FUNCTION explicit inline BasicView( + const Impl::ViewCtorProp<P...> &arg_prop, + std::enable_if_t<Impl::ViewCtorProp<P...>::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView( + data_handle_type(Impl::get_property<Impl::PointerTag>(arg_prop)), + arg_mapping) {} + + protected: + template <class OtherElementType, class OtherExtents, class OtherLayoutPolicy, + class OtherAccessorPolicy, class... SliceSpecifiers> + KOKKOS_INLINE_FUNCTION BasicView( + Impl::SubViewCtorTag, + const BasicView<OtherElementType, OtherExtents, OtherLayoutPolicy, + OtherAccessorPolicy> &src_view, + SliceSpecifiers... slices) + : BasicView(submdspan( + src_view.to_mdspan(), + Impl::transform_kokkos_slice_to_mdspan_slice(slices)...)) {} + + public: + //---------------------------------------- + // Conversion to MDSpan + template <class OtherElementType, class OtherExtents, class OtherLayoutPolicy, + class OtherAccessor, + typename = std::enable_if_t< + std::is_assignable_v<mdspan<OtherElementType, OtherExtents, + OtherLayoutPolicy, OtherAccessor>, + mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr + operator mdspan<OtherElementType, OtherExtents, OtherLayoutPolicy, + OtherAccessor>() const { + return mdspan_type(m_ptr, m_map, m_acc); + } + + // Here we use an overload instead of a default parameter as a workaround + // to a potential compiler bug with clang 17. 
It may be present in other + // compilers + template <class OtherAccessorType = AccessorPolicy, + typename = std::enable_if_t<std::is_constructible_v< + typename mdspan_type::data_handle_type, + typename OtherAccessorType::data_handle_type>>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan() const { + using ret_mdspan_type = + mdspan<typename mdspan_type::element_type, + typename mdspan_type::extents_type, + typename mdspan_type::layout_type, OtherAccessorType>; + return ret_mdspan_type( + static_cast<typename OtherAccessorType::data_handle_type>( + data_handle()), + mapping(), static_cast<OtherAccessorType>(accessor())); + } + + template < + class OtherAccessorType = AccessorPolicy, + typename = std::enable_if_t<std::is_assignable_v< + data_handle_type, typename OtherAccessorType::data_handle_type>>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType &other_accessor) const { + using ret_mdspan_type = + mdspan<element_type, extents_type, layout_type, OtherAccessorType>; + return ret_mdspan_type( + static_cast<typename OtherAccessorType::data_handle_type>( + data_handle()), + mapping(), other_accessor); + } + + KOKKOS_FUNCTION void assign_data(element_type *ptr) { m_ptr = ptr; } + + // ========================= mdspan ================================= + + // [mdspan.mdspan.members], members + +// Introducing the C++20 and C++23 variants of the operators already +#ifndef KOKKOS_ENABLE_CXX17 +#ifndef KOKKOS_ENABLE_CXX20 + // C++23 only operator[] + template <class... OtherIndexTypes> + requires((std::is_convertible_v<OtherIndexTypes, index_type> && ...) && + (std::is_nothrow_constructible_v<index_type, OtherIndexTypes> && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator[]( + OtherIndexTypes... 
indices) const { + return m_acc.access(m_ptr, + m_map(static_cast<index_type>(std::move(indices))...)); + } + + template <class OtherIndexType> + requires( + std::is_convertible_v<const OtherIndexType &, index_type> && + std::is_nothrow_constructible_v<index_type, const OtherIndexType &>) + KOKKOS_FUNCTION constexpr reference operator[]( + const Array<OtherIndexType, rank()> &indices) const { + return m_acc.access(m_ptr, + [&]<size_t... Idxs>(std::index_sequence<Idxs...>) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence<rank()>())); + } + + template <class OtherIndexType> + requires( + std::is_convertible_v<const OtherIndexType &, index_type> && + std::is_nothrow_constructible_v<index_type, const OtherIndexType &>) + KOKKOS_FUNCTION constexpr reference operator[]( + std::span<OtherIndexType, rank()> indices) const { + return m_acc.access(m_ptr, + [&]<size_t... Idxs>(std::index_sequence<Idxs...>) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence<rank()>())); + } +#endif + + // C++20 operator() + template <class... OtherIndexTypes> + requires((std::is_convertible_v<OtherIndexTypes, index_type> && ...) && + (std::is_nothrow_constructible_v<index_type, OtherIndexTypes> && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast<index_type>(std::move(indices))...)); + } + + template <class OtherIndexType> + requires( + std::is_convertible_v<const OtherIndexType &, index_type> && + std::is_nothrow_constructible_v<index_type, const OtherIndexType &>) + KOKKOS_FUNCTION constexpr reference operator()( + const Array<OtherIndexType, rank()> &indices) const { + return m_acc.access(m_ptr, + [&]<size_t... 
Idxs>(std::index_sequence<Idxs...>) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence<rank()>())); + } + + template <class OtherIndexType> + requires( + std::is_convertible_v<const OtherIndexType &, index_type> && + std::is_nothrow_constructible_v<index_type, const OtherIndexType &>) + KOKKOS_FUNCTION constexpr reference operator()( + std::span<OtherIndexType, rank()> indices) const { + return m_acc.access(m_ptr, + [&]<size_t... Idxs>(std::index_sequence<Idxs...>) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence<rank()>())); + } +#else + // C++17 variant of operator() + + // Some weird unexplained issue in compiling the SFINAE version with CUDA/MSVC + // So we just use post factor check here with static_assert +#if defined(KOKKOS_ENABLE_CUDA) && defined(_WIN32) + template <class... OtherIndexTypes> + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + static_assert((std::is_convertible_v<OtherIndexTypes, index_type> && ...)); + static_assert( + (std::is_nothrow_constructible_v<index_type, OtherIndexTypes> && ...)); + static_assert((sizeof...(OtherIndexTypes)) == rank()); + return m_acc.access(m_ptr, + m_map(static_cast<index_type>(std::move(indices))...)); + } +#else + template <class... OtherIndexTypes> + KOKKOS_FUNCTION constexpr std::enable_if_t< + ((std::is_convertible_v<OtherIndexTypes, index_type> && ...)) && + ((std::is_nothrow_constructible_v<index_type, OtherIndexTypes> && + ...)) && + ((sizeof...(OtherIndexTypes)) == rank()), + reference> + operator()(OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast<index_type>(std::move(indices))...)); + } +#endif +#endif + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside size() + template <size_t... Idxs> + KOKKOS_FUNCTION constexpr size_type size_impl( + std::index_sequence<Idxs...>) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. 
+ // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return 0u; + return ((static_cast<size_type>(m_map.extents().extent(Idxs))) * ... * + size_type(1)); + } + + public: + KOKKOS_FUNCTION constexpr size_type size() const noexcept { + return size_impl(std::make_index_sequence<rank()>()); + } + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside empty() + template <size_t... Idxs> + KOKKOS_FUNCTION constexpr bool empty_impl( + std::index_sequence<Idxs...>) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return true; + return (rank() > 0) && + ((m_map.extents().extent(Idxs) == index_type(0)) || ... 
|| false); + } + + public: + [[nodiscard]] KOKKOS_FUNCTION constexpr bool empty() const noexcept { + return empty_impl(std::make_index_sequence<rank()>()); + } + + KOKKOS_FUNCTION friend constexpr void swap(BasicView &x, + BasicView &y) noexcept { + kokkos_swap(x.m_ptr, y.m_ptr); + kokkos_swap(x.m_map, y.m_map); + kokkos_swap(x.m_acc, y.m_acc); + } + + KOKKOS_FUNCTION constexpr const extents_type &extents() const noexcept { + return m_map.extents(); + }; + KOKKOS_FUNCTION constexpr const data_handle_type &data_handle() + const noexcept { + return m_ptr; + }; + KOKKOS_FUNCTION constexpr const mapping_type &mapping() const noexcept { + return m_map; + }; + KOKKOS_FUNCTION constexpr const accessor_type &accessor() const noexcept { + return m_acc; + }; + + KOKKOS_FUNCTION static constexpr bool is_always_unique() noexcept { + return mapping_type::is_always_unique(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return mapping_type::is_always_exhaustive(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_strided() noexcept { + return mapping_type::is_always_strided(); + }; + + KOKKOS_FUNCTION constexpr bool is_unique() const { + return m_map.is_unique(); + }; + KOKKOS_FUNCTION constexpr bool is_exhaustive() const { + return m_map.is_exhaustive(); + }; + KOKKOS_FUNCTION constexpr bool is_strided() const { + return m_map.is_strided(); + }; + KOKKOS_FUNCTION constexpr index_type stride(rank_type r) const { + return m_map.stride(r); + }; + + protected: +#ifndef __NVCC__ + KOKKOS_IMPL_NO_UNIQUE_ADDRESS data_handle_type m_ptr{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS mapping_type m_map{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS accessor_type m_acc{}; +#else + data_handle_type m_ptr{}; + mapping_type m_map{}; + accessor_type m_acc{}; +#endif + + template <class, class, class, class> + friend class BasicView; +}; +} // namespace Kokkos::Impl + +#endif diff --git a/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp 
b/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp new file mode 100644 index 0000000000000000000000000000000000000000..eb11630b21b8f759a0cb922467f0a755591b7b18 --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -0,0 +1,305 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include <Kokkos_Macros.hpp> +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_VIEW_ALLOC_HPP +#define KOKKOS_VIEW_ALLOC_HPP + +#include <cstring> +#include <type_traits> +#include <string> +#include <optional> + +#include <impl/Kokkos_Tools.hpp> +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <Kokkos_ExecPolicy.hpp> +#include <impl/Kokkos_ZeroMemset_fwd.hpp> + +namespace Kokkos::Impl { + +template <typename T> +bool is_zero_byte(const T& x) { + constexpr std::byte all_zeroes[sizeof(T)] = {}; + return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; +} + +template <class DeviceType, class ValueType> +struct ViewValueFunctor { + using ExecSpace = typename DeviceType::execution_space; + + struct DestroyTag {}; + struct ConstructTag {}; + + ExecSpace space; + ValueType* ptr; + size_t n; + std::string name; + bool default_exec_space; + + template <class SameValueType = ValueType> + KOKKOS_FUNCTION + std::enable_if_t<std::is_default_constructible_v<SameValueType>> + operator()(ConstructTag, const size_t i) const { + new (ptr + i) ValueType(); + } + + KOKKOS_FUNCTION 
void operator()(DestroyTag, const size_t i) const { + // When instantiating a View on host execution space with a host only + // destructor the workaround for CUDA device symbol instantiation tries to + // still compile a destruction kernel for the device, and issues a warning + // for host from host-device +#ifdef KOKKOS_ENABLE_CUDA + if constexpr (std::is_same_v<ExecSpace, Cuda>) { + KOKKOS_IF_ON_DEVICE(((ptr + i)->~ValueType();)) + } else { + KOKKOS_IF_ON_HOST(((ptr + i)->~ValueType();)) + } +#else + (ptr + i)->~ValueType(); +#endif + } + + ViewValueFunctor() = default; + ViewValueFunctor(const ViewValueFunctor&) = default; + ViewValueFunctor& operator=(const ViewValueFunctor&) = default; + + ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, + size_t const arg_n, std::string arg_name) + : space(arg_space), + ptr(arg_ptr), + n(arg_n), + name(std::move(arg_name)), + default_exec_space(false) { + functor_instantiate_workaround(); + } + + ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, + std::string arg_name) + : space(ExecSpace{}), + ptr(arg_ptr), + n(arg_n), + name(std::move(arg_name)), + default_exec_space(true) { + functor_instantiate_workaround(); + } + + template <typename Tag> + void parallel_for_implementation() { + using PolicyType = + Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>, Tag>; + PolicyType policy(space, 0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + const std::string functor_name = + (std::is_same_v<Tag, DestroyTag> + ? 
"Kokkos::View::destruction [" + name + "]" + : "Kokkos::View::initialization [" + name + "]"); + Kokkos::Profiling::beginParallelFor( + functor_name, Kokkos::Profiling::Experimental::device_id(space), + &kpID); + } + +#ifdef KOKKOS_ENABLE_CUDA + if (std::is_same<ExecSpace, Kokkos::Cuda>::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } +#endif + const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure( + *this, policy); + closure.execute(); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space || std::is_same_v<Tag, DestroyTag>) { + space.fence(std::is_same_v<Tag, DestroyTag> + ? "Kokkos::View::destruction before deallocate" + : "Kokkos::View::initialization"); + } + } + + // Shortcut for zero initialization + void zero_memset_implementation() { + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + // We are not really using parallel_for here but using beginParallelFor + // instead of begin_parallel_for (and adding "via memset") is the best + // we can do to indicate that this is not supposed to be tunable (and + // doesn't really execute a parallel_for). 
+ Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "] via memset", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } + + (void)ZeroMemset(space, ptr, n * sizeof(ValueType)); + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space) { + space.fence("Kokkos::View::initialization via memset"); + } + } + + void construct_shared_allocation() { +// On A64FX memset seems to do the wrong thing with regards to first touch +// leading to the significant performance issues +#ifndef KOKKOS_ARCH_A64FX + if constexpr (std::is_trivial_v<ValueType>) { + // value-initialization is equivalent to filling with zeros + zero_memset_implementation(); + } else +#endif + parallel_for_implementation<ConstructTag>(); + } + + void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v<ValueType>) { + // do nothing, don't bother calling the destructor + } else { +#ifdef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND + if constexpr (std::is_same_v<typename ExecSpace::memory_space, + Kokkos::HostSpace>) + for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); + else +#endif + parallel_for_implementation<DestroyTag>(); + } + } + + // This function is to ensure that the functor with DestroyTag is instantiated + // This is a workaround to avoid "cudaErrorInvalidDeviceFunction" error later + // when the function is queried with cudaFuncGetAttributes + void functor_instantiate_workaround() { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) + if (false) { + parallel_for_implementation<DestroyTag>(); + } +#endif + } +}; + +template <class DeviceType, class ValueType> +struct ViewValueFunctorSequentialHostInit { + using ExecSpace = typename DeviceType::execution_space; + using MemSpace = typename DeviceType::memory_space; + 
static_assert(SpaceAccessibility<HostSpace, MemSpace>::accessible); + + ValueType* ptr; + size_t n; + + ViewValueFunctorSequentialHostInit() = default; + + ViewValueFunctorSequentialHostInit(ExecSpace const& /*arg_space*/, + ValueType* const arg_ptr, + size_t const arg_n, + std::string /*arg_name*/) + : ptr(arg_ptr), n(arg_n) {} + + ViewValueFunctorSequentialHostInit(ValueType* const arg_ptr, + size_t const arg_n, + std::string /*arg_name*/) + : ptr(arg_ptr), n(arg_n) {} + + void construct_shared_allocation() { + if constexpr (std::is_trivial_v<ValueType>) { + // value-initialization is equivalent to filling with zeros + std::memset(static_cast<void*>(ptr), 0, n * sizeof(ValueType)); + } else { + for (size_t i = 0; i < n; ++i) { + new (ptr + i) ValueType(); + } + } + } + + void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v<ValueType>) { + // do nothing, don't bother calling the destructor + } else { + for (size_t i = 0; i < n; ++i) { + (ptr + i)->~ValueType(); + } + } + } +}; + +template <class ElementType, class MemorySpace, class ExecutionSpace, + bool Initialize, bool SequentialInit> +Kokkos::Impl::SharedAllocationRecord<void, void>* make_shared_allocation_record( + const size_t& required_span_size, std::string_view label, + const MemorySpace& memory_space, + const std::optional<ExecutionSpace> exec_space, + std::bool_constant<Initialize>, std::bool_constant<SequentialInit>) { + static_assert(SpaceAccessibility<ExecutionSpace, MemorySpace>::accessible); + + // Use this for constructing and destroying the view + using device_type = Kokkos::Device<ExecutionSpace, MemorySpace>; + using functor_type = std::conditional_t< + SequentialInit, + ViewValueFunctorSequentialHostInit<device_type, ElementType>, + ViewValueFunctor<device_type, ElementType>>; + using record_type = + Kokkos::Impl::SharedAllocationRecord<MemorySpace, functor_type>; + + /* Force alignment of allocations on on 8 byte boundaries even for + * element types smaller 
than 8 bytes */ + static constexpr std::size_t align_mask = 0x7; + + // Calculate the total size of the memory, in bytes, and make sure it is + // byte-aligned + const std::size_t alloc_size = + (required_span_size * sizeof(ElementType) + align_mask) & ~align_mask; + + auto* record = + exec_space + ? record_type::allocate(*exec_space, memory_space, std::string{label}, + alloc_size) + : record_type::allocate(memory_space, std::string{label}, alloc_size); + + auto ptr = static_cast<ElementType*>(record->data()); + + auto functor = + exec_space ? functor_type(*exec_space, ptr, required_span_size, + std::string{label}) + : functor_type(ptr, required_span_size, std::string{label}); + + // Only initialize if the allocation is non-zero. + // May be zero if one of the dimensions is zero. + if constexpr (Initialize) { + if (alloc_size) { + // Assume destruction is only required when construction is requested. + // The ViewValueFunctor has both value construction and destruction + // operators. + record->m_destroy = std::move(functor); + + // Construct values + record->m_destroy.construct_shared_allocation(); + } + } + + return record; +} + +} // namespace Kokkos::Impl + +#endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp similarity index 96% rename from packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp index 23d4c2524c791913f4bf8aeeabf00e2bed8edd25..f77066b70f57f204a9f924d7228cca8437e42296 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewAtomic.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_ATOMIC_VIEW_HPP -#define KOKKOS_ATOMIC_VIEW_HPP +#ifndef KOKKOS_VIEWATOMIC_HPP +#define KOKKOS_VIEWATOMIC_HPP #include <Kokkos_Macros.hpp> #include <Kokkos_Atomic.hpp> @@ -44,10 +44,10 @@ class 
AtomicDataElement { } KOKKOS_INLINE_FUNCTION - void inc() const { Kokkos::atomic_increment(ptr); } + void inc() const { Kokkos::atomic_inc(ptr); } KOKKOS_INLINE_FUNCTION - void dec() const { Kokkos::atomic_decrement(ptr); } + void dec() const { Kokkos::atomic_dec(ptr); } KOKKOS_INLINE_FUNCTION const_value_type operator++() const { @@ -215,7 +215,7 @@ class AtomicViewDataHandle { } KOKKOS_INLINE_FUNCTION - operator typename ViewTraits::value_type*() const { return ptr; } + operator typename ViewTraits::value_type *() const { return ptr; } }; } // namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp similarity index 78% rename from packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp index e1b8ba86a5b5b58721071548d1c68d6986227257..f08047471728d07d8368cbbb2fe309f784f53542 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewCtor.hpp @@ -23,12 +23,16 @@ namespace Kokkos { namespace Impl { +struct SequentialHostInit_t {}; struct WithoutInitializing_t {}; struct AllowPadding_t {}; template <typename> struct is_view_ctor_property : public std::false_type {}; +template <> +struct is_view_ctor_property<SequentialHostInit_t> : public std::true_type {}; + template <> struct is_view_ctor_property<WithoutInitializing_t> : public std::true_type {}; @@ -68,8 +72,8 @@ struct ViewCtorProp<void> {}; */ template <typename Specialize, typename T> struct ViewCtorProp<void, CommonViewAllocProp<Specialize, T>> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = CommonViewAllocProp<Specialize, T>; @@ -84,12 +88,12 @@ struct ViewCtorProp<void, CommonViewAllocProp<Specialize, T>> { /* Property flags have constexpr value */ template 
<typename P> -struct ViewCtorProp< - std::enable_if_t<std::is_same<P, AllowPadding_t>::value || - std::is_same<P, WithoutInitializing_t>::value>, - P> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; +struct ViewCtorProp<std::enable_if_t<std::is_same_v<P, AllowPadding_t> || + std::is_same_v<P, WithoutInitializing_t> || + std::is_same_v<P, SequentialHostInit_t>>, + P> { + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = P; @@ -102,14 +106,14 @@ struct ViewCtorProp< /* Map input label type to std::string */ template <typename Label> struct ViewCtorProp<std::enable_if_t<is_view_label<Label>::value>, Label> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = std::string; ViewCtorProp(const type &arg) : value(arg) {} - ViewCtorProp(type &&arg) : value(arg) {} + ViewCtorProp(type &&arg) : value(std::move(arg)) {} type value; }; @@ -118,8 +122,8 @@ template <typename Space> struct ViewCtorProp<std::enable_if_t<Kokkos::is_memory_space<Space>::value || Kokkos::is_execution_space<Space>::value>, Space> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = Space; @@ -131,8 +135,8 @@ struct ViewCtorProp<std::enable_if_t<Kokkos::is_memory_space<Space>::value || template <typename T> struct ViewCtorProp<void, T *> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = T *; @@ -199,19 +203,29 @@ struct ViewCtorProp : public ViewCtorProp<void, P>... 
{ Kokkos::Impl::has_type<AllowPadding_t, P...>::value; static constexpr bool initialize = !Kokkos::Impl::has_type<WithoutInitializing_t, P...>::value; + static constexpr bool sequential_host_init = + Kokkos::Impl::has_type<SequentialHostInit_t, P...>::value; + static_assert(initialize || !sequential_host_init, + "Incompatible WithoutInitializing and SequentialHostInit view " + "alloc properties"); using memory_space = typename var_memory_space::type; using execution_space = typename var_execution_space::type; using pointer_type = typename var_pointer::type; - /* Copy from a matching argument list. - * Requires std::is_same< P , ViewCtorProp< void , Args >::value ... - */ - template <typename... Args> - inline ViewCtorProp(Args const &... args) : ViewCtorProp<void, P>(args)... {} + // Construct from a matching argument list. + // + // Note that if P is empty, this constructor is the default constructor. + // On the other hand, if P is not empty, the constraint implies that + // there is no default constructor. + template <typename... Args, + typename = std::enable_if_t<std::conjunction_v< + std::is_constructible<view_ctor_prop_base<P>, Args &&>...>>> + ViewCtorProp(Args &&...args) + : ViewCtorProp<void, P>(std::forward<Args>(args))... {} template <typename... Args> - KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &... args) + KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &...args) : ViewCtorProp<void, pointer_type>(arg0), ViewCtorProp<void, typename ViewCtorProp<void, Args>::type>(args)... {} @@ -243,7 +257,7 @@ auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop) { template <typename... P, typename Property, typename... Properties> auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop, [[maybe_unused]] const Property &property, - const Properties &... 
properties) { + const Properties &...properties) { if constexpr ((is_execution_space<Property>::value && !ViewCtorProp<P...>::has_execution_space) || (is_memory_space<Property>::value && @@ -251,7 +265,9 @@ auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop, (is_view_label<Property>::value && !ViewCtorProp<P...>::has_label) || (std::is_same_v<Property, WithoutInitializing_t> && - ViewCtorProp<P...>::initialize)) { + ViewCtorProp<P...>::initialize) || + (std::is_same_v<Property, SequentialHostInit_t> && + !ViewCtorProp<P...>::sequential_host_init)) { using NewViewCtorProp = ViewCtorProp<P..., Property>; NewViewCtorProp new_view_ctor_prop(view_ctor_prop); static_cast<ViewCtorProp<void, Property> &>(new_view_ctor_prop).value = @@ -291,7 +307,7 @@ template <class... P, class Property, class... Properties> struct WithPropertiesIfUnset<ViewCtorProp<P...>, Property, Properties...> { static constexpr auto apply_prop(const ViewCtorProp<P...> &view_ctor_prop, const Property &prop, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space<Property>::value && !ViewCtorProp<P...>::has_execution_space) || (is_memory_space<Property>::value && @@ -299,7 +315,9 @@ struct WithPropertiesIfUnset<ViewCtorProp<P...>, Property, Properties...> { (is_view_label<Property>::value && !ViewCtorProp<P...>::has_label) || (std::is_same_v<Property, WithoutInitializing_t> && - ViewCtorProp<P...>::initialize)) { + ViewCtorProp<P...>::initialize) || + (std::is_same_v<Property, SequentialHostInit_t> && + !ViewCtorProp<P...>::sequential_host_init)) { using NewViewCtorProp = ViewCtorProp<P..., Property>; NewViewCtorProp new_view_ctor_prop(view_ctor_prop); static_cast<ViewCtorProp<void, Property> &>(new_view_ctor_prop).value = @@ -315,7 +333,7 @@ struct WithPropertiesIfUnset<ViewCtorProp<P...>, Property, Properties...> { template <typename... P, class... 
Properties> auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop, - const Properties &... properties) { + const Properties &...properties) { return WithPropertiesIfUnset<ViewCtorProp<P...>, Properties...>::apply_prop( view_ctor_prop, properties...); } @@ -424,6 +442,48 @@ using ViewAllocateWithoutInitializing = Impl::ViewCtorProp<Impl::WithoutInitializing_t, std::string, Impl::ViewAllocateWithoutInitializingBackwardCompat>; +inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; + +inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; + +inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; + +/** \brief Create View allocation parameter bundle from argument list. + * + * Valid argument list members are: + * 1) label as a "string" or std::string + * 2) memory space instance of the View::memory_space type + * 3) execution space instance compatible with the View::memory_space + * 4) Kokkos::WithoutInitializing to bypass initialization + * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory + * alignment + */ +template <class... Args> +auto view_alloc(Args &&...args) { + using return_type = Impl::ViewCtorProp<typename Impl::ViewCtorProp< + void, Kokkos::Impl::remove_cvref_t<Args>>::type...>; + + static_assert(!return_type::has_pointer, + "Cannot give pointer-to-memory for view allocation"); + + return return_type(std::forward<Args>(args)...); +} + +template <class... 
Args> +KOKKOS_INLINE_FUNCTION + Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...> + view_wrap(Args const &...args) { + using return_type = + Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...>; + + static_assert(!return_type::has_memory_space && + !return_type::has_execution_space && + !return_type::has_label && return_type::has_pointer, + "Must only give pointer-to-memory for view wrapping"); + + return return_type(args...); +} + } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp b/packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp new file mode 100644 index 0000000000000000000000000000000000000000..37b6e2802fc9deb7c94c95815c74d7005e36171f --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp @@ -0,0 +1,401 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_VIEW_DATA_ANALYSIS_HPP +#define KOKKOS_VIEW_DATA_ANALYSIS_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos::Impl { + +template <unsigned I, size_t... Args> +struct variadic_size_t { + enum : size_t { value = KOKKOS_INVALID_INDEX }; +}; + +template <size_t Val, size_t... 
Args> +struct variadic_size_t<0, Val, Args...> { + enum : size_t { value = Val }; +}; + +template <unsigned I, size_t Val, size_t... Args> +struct variadic_size_t<I, Val, Args...> { + enum : size_t { value = variadic_size_t<I - 1, Args...>::value }; +}; + +template <size_t... Args> +struct rank_dynamic; + +template <> +struct rank_dynamic<> { + enum : unsigned { value = 0 }; +}; + +template <size_t Val, size_t... Args> +struct rank_dynamic<Val, Args...> { + enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic<Args...>::value }; +}; + +#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ + template <size_t V, unsigned> \ + struct ViewDimension##R { \ + static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + }; \ + template <size_t V, unsigned RD> \ + constexpr size_t ViewDimension##R<V, RD>::ArgN##R; \ + template <size_t V, unsigned RD> \ + constexpr size_t ViewDimension##R<V, RD>::N##R; \ + template <unsigned RD> \ + struct ViewDimension##R<0u, RD> { \ + static constexpr size_t ArgN##R = 0; \ + std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ + }; \ + template <unsigned RD> \ + constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; + +KOKKOS_IMPL_VIEW_DIMENSION(0) +KOKKOS_IMPL_VIEW_DIMENSION(1) +KOKKOS_IMPL_VIEW_DIMENSION(2) +KOKKOS_IMPL_VIEW_DIMENSION(3) +KOKKOS_IMPL_VIEW_DIMENSION(4) +KOKKOS_IMPL_VIEW_DIMENSION(5) +KOKKOS_IMPL_VIEW_DIMENSION(6) +KOKKOS_IMPL_VIEW_DIMENSION(7) + +#undef KOKKOS_IMPL_VIEW_DIMENSION + +// MSVC does 
not do empty base class optimization by default. +// Per standard it is required for standard layout types +template <size_t... Vals> +struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension + : public ViewDimension0<variadic_size_t<0u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension1<variadic_size_t<1u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension2<variadic_size_t<2u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension3<variadic_size_t<3u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension4<variadic_size_t<4u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension5<variadic_size_t<5u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension6<variadic_size_t<6u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension7<variadic_size_t<7u, Vals...>::value, + rank_dynamic<Vals...>::value> { + using D0 = ViewDimension0<variadic_size_t<0U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D1 = ViewDimension1<variadic_size_t<1U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D2 = ViewDimension2<variadic_size_t<2U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D3 = ViewDimension3<variadic_size_t<3U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D4 = ViewDimension4<variadic_size_t<4U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D5 = ViewDimension5<variadic_size_t<5U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D6 = ViewDimension6<variadic_size_t<6U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D7 = ViewDimension7<variadic_size_t<7U, Vals...>::value, + rank_dynamic<Vals...>::value>; + + using D0::ArgN0; + using D1::ArgN1; + using D2::ArgN2; + using D3::ArgN3; + using D4::ArgN4; + using D5::ArgN5; + using D6::ArgN6; + using D7::ArgN7; + + using D0::N0; + using D1::N1; + using D2::N2; + using D3::N3; + using D4::N4; + using D5::N5; + using 
D6::N6; + using D7::N7; + + static constexpr unsigned rank = sizeof...(Vals); + static constexpr unsigned rank_dynamic = Impl::rank_dynamic<Vals...>::value; + + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; + ViewDimension& operator=(const ViewDimension&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, + size_t n5, size_t n6, size_t n7) + : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), + D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), + D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), + D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), + D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4), + D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), + D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), + D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} + + KOKKOS_INLINE_FUNCTION + constexpr size_t extent(const unsigned r) const noexcept { + return r == 0 + ? N0 + : (r == 1 + ? N1 + : (r == 2 + ? N2 + : (r == 3 + ? N3 + : (r == 4 + ? N4 + : (r == 5 + ? N5 + : (r == 6 + ? N6 + : (r == 7 ? N7 + : 0))))))); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return r == 0 + ? ArgN0 + : (r == 1 + ? ArgN1 + : (r == 2 + ? ArgN2 + : (r == 3 + ? ArgN3 + : (r == 4 + ? ArgN4 + : (r == 5 + ? ArgN5 + : (r == 6 + ? ArgN6 + : (r == 7 ? ArgN7 + : 0))))))); + } + + template <size_t N> + struct prepend { + using type = ViewDimension<N, Vals...>; + }; + + template <size_t N> + struct append { + using type = ViewDimension<Vals..., N>; + }; +}; + +template <class A, class B> +struct ViewDimensionJoin; + +template <size_t... A, size_t... B> +struct ViewDimensionJoin<ViewDimension<A...>, ViewDimension<B...>> { + using type = ViewDimension<A..., B...>; +}; + +//---------------------------------------------------------------------------- + +template <class DstDim, class SrcDim> +struct ViewDimensionAssignable; + +template <size_t... DstArgs, size_t... 
SrcArgs> +struct ViewDimensionAssignable<ViewDimension<DstArgs...>, + ViewDimension<SrcArgs...>> { + using dst = ViewDimension<DstArgs...>; + using src = ViewDimension<SrcArgs...>; + + enum { + value = unsigned(dst::rank) == unsigned(src::rank) && + ( + // Compile time check that potential static dimensions match + ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) + ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) + : true) && + ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) + ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) + : true) && + ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) + ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) + : true) && + ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) + ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) + : true) && + ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) + ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) + : true) && + ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) + ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) + : true) && + ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) + ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) + : true) && + ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) + ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) + : true)) + }; +}; + +/** \brief Given a value type and dimension generate the View data type */ +template <class T, class Dim> +struct ViewDataType; + +template <class T> +struct ViewDataType<T, ViewDimension<>> { + using type = T; +}; + +template <class T, size_t... Args> +struct ViewDataType<T, ViewDimension<0, Args...>> { + using type = typename ViewDataType<T*, ViewDimension<Args...>>::type; +}; + +template <class T, size_t N, size_t... Args> +struct ViewDataType<T, ViewDimension<N, Args...>> { + using type = typename ViewDataType<T, ViewDimension<Args...>>::type[N]; +}; + +/**\brief Analysis of View data type. 
+ * + * Data type conforms to one of the following patterns : + * {const} value_type [][#][#][#] + * {const} value_type ***[#][#][#] + * Where the sum of counts of '*' and '[#]' is at most ten. + * + * Provide alias for ViewDimension<...> and value_type. + */ +template <class T> +struct ViewArrayAnalysis { + using value_type = T; + using const_value_type = std::add_const_t<T>; + using non_const_value_type = std::remove_const_t<T>; + using static_dimension = ViewDimension<>; + using dynamic_dimension = ViewDimension<>; + using dimension = ViewDimension<>; +}; + +template <class T, size_t N> +struct ViewArrayAnalysis<T[N]> { + private: + using nested = ViewArrayAnalysis<T>; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using static_dimension = + typename nested::static_dimension::template prepend<N>::type; + + using dynamic_dimension = typename nested::dynamic_dimension; + + using dimension = + typename ViewDimensionJoin<dynamic_dimension, static_dimension>::type; +}; + +template <class T> +struct ViewArrayAnalysis<T[]> { + private: + using nested = ViewArrayAnalysis<T>; + using nested_dimension = typename nested::dimension; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin<dynamic_dimension, static_dimension>::type; +}; + +template <class T> +struct ViewArrayAnalysis<T*> { + private: + using nested = ViewArrayAnalysis<T>; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using 
non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin<dynamic_dimension, static_dimension>::type; +}; + +template <class DataType, class ArrayLayout, class ValueType> +struct ViewDataAnalysis { + private: + using array_analysis = ViewArrayAnalysis<DataType>; + + // ValueType is opportunity for partial specialization. + // Must match array analysis when this default template is used. + static_assert( + std::is_same_v<ValueType, typename array_analysis::non_const_value_type>); + + public: + using specialize = void; // No specialization + + using dimension = typename array_analysis::dimension; + using value_type = typename array_analysis::value_type; + using const_value_type = typename array_analysis::const_value_type; + using non_const_value_type = typename array_analysis::non_const_value_type; + + // Generate analogous multidimensional array specification type. + using type = typename ViewDataType<value_type, dimension>::type; + using const_type = typename ViewDataType<const_value_type, dimension>::type; + using non_const_type = + typename ViewDataType<non_const_value_type, dimension>::type; + + // Generate "flattened" multidimensional array specification type. 
+ using scalar_array_type = type; + using const_scalar_array_type = const_type; + using non_const_scalar_array_type = non_const_type; +}; + +template <class Dimension, class Layout, class Enable = void> +struct ViewOffset { + using is_mapping_plugin = std::false_type; +}; +} // namespace Kokkos::Impl + +#endif // KOKKOS_VIEW_DATA_ANALYSIS_HPP diff --git a/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp b/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fd406d58ccaea007bab3feb3bfe2c1dbe063f676 --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_ViewLegacy.hpp @@ -0,0 +1,1604 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include <Kokkos_Macros.hpp> +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWLEGACY_HPP +#define KOKKOS_VIEWLEGACY_HPP + +#include <type_traits> +#include <string> +#include <algorithm> +#include <initializer_list> + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <Kokkos_ExecPolicy.hpp> +#include <View/Hooks/Kokkos_ViewHooks.hpp> + +#include <impl/Kokkos_Tools.hpp> +#include <impl/Kokkos_Utilities.hpp> + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include <View/MDSpan/Kokkos_MDSpan_Extents.hpp> +#include <View/MDSpan/Kokkos_MDSpan_Layout.hpp> +#include <View/MDSpan/Kokkos_MDSpan_Accessor.hpp> +#endif +#include <Kokkos_MinMax.hpp> + +#include <View/Kokkos_ViewTraits.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. 
+ * + * Valid ways in which template arguments may be specified: + * - View< DataType > + * - View< DataType , Layout > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, <tt>double*</tt> + * indicates a one-dimensional array of \c double with run-time + * dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * <tt>Space</tt>. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. + * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on + * Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. 
+ * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View<double*, Cuda> out, + * View<const double*, Cuda> in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View<const double*, Cuda, RandomAccess> in_rr = in; + * // ... do something with in_rr and out ... + * } + * \endcode + */ + +} // namespace Kokkos + +namespace Kokkos { + +template <class T1, class T2> +struct is_always_assignable_impl; + +template <class... ViewTDst, class... ViewTSrc> +struct is_always_assignable_impl<Kokkos::View<ViewTDst...>, + Kokkos::View<ViewTSrc...>> { + using mapping_type = Kokkos::Impl::ViewMapping< + typename Kokkos::View<ViewTDst...>::traits, + typename Kokkos::View<ViewTSrc...>::traits, + typename Kokkos::View<ViewTDst...>::traits::specialize>; + + constexpr static bool value = + mapping_type::is_assignable && + static_cast<int>(Kokkos::View<ViewTDst...>::rank_dynamic) >= + static_cast<int>(Kokkos::View<ViewTSrc...>::rank_dynamic); +}; + +template <class View1, class View2> +using is_always_assignable = is_always_assignable_impl< + std::remove_reference_t<View1>, + std::remove_const_t<std::remove_reference_t<View2>>>; + +template <class T1, class T2> +inline constexpr bool is_always_assignable_v = + is_always_assignable<T1, T2>::value; + +template <class... ViewTDst, class... 
ViewTSrc> +constexpr bool is_assignable(const Kokkos::View<ViewTDst...>& dst, + const Kokkos::View<ViewTSrc...>& src) { + using DstTraits = typename Kokkos::View<ViewTDst...>::traits; + using SrcTraits = typename Kokkos::View<ViewTSrc...>::traits; + using mapping_type = + Kokkos::Impl::ViewMapping<DstTraits, SrcTraits, + typename DstTraits::specialize>; + + return is_always_assignable_v<Kokkos::View<ViewTDst...>, + Kokkos::View<ViewTSrc...>> || + (mapping_type::is_assignable && + ((DstTraits::dimension::rank_dynamic >= 1) || + (dst.static_extent(0) == src.extent(0))) && + ((DstTraits::dimension::rank_dynamic >= 2) || + (dst.static_extent(1) == src.extent(1))) && + ((DstTraits::dimension::rank_dynamic >= 3) || + (dst.static_extent(2) == src.extent(2))) && + ((DstTraits::dimension::rank_dynamic >= 4) || + (dst.static_extent(3) == src.extent(3))) && + ((DstTraits::dimension::rank_dynamic >= 5) || + (dst.static_extent(4) == src.extent(4))) && + ((DstTraits::dimension::rank_dynamic >= 6) || + (dst.static_extent(5) == src.extent(5))) && + ((DstTraits::dimension::rank_dynamic >= 7) || + (dst.static_extent(6) == src.extent(6))) && + ((DstTraits::dimension::rank_dynamic >= 8) || + (dst.static_extent(7) == src.extent(7)))); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <View/Kokkos_ViewMapping.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class DataType, class... Properties> +class View; + +template <class> +struct is_view : public std::false_type {}; + +template <class D, class... P> +struct is_view<View<D, P...>> : public std::true_type {}; + +template <class D, class... 
P> +struct is_view<const View<D, P...>> : public std::true_type {}; + +template <class T> +inline constexpr bool is_view_v = is_view<T>::value; + +template <class DataType, class... Properties> +class View : public ViewTraits<DataType, Properties...> { + private: + template <class, class...> + friend class View; + template <class, class...> + friend class Kokkos::Impl::ViewMapping; + + using view_tracker_type = Kokkos::Impl::ViewTracker<View>; + + public: + using traits = ViewTraits<DataType, Properties...>; + + private: + using map_type = + Kokkos::Impl::ViewMapping<traits, typename traits::specialize>; + template <typename V> + friend struct Kokkos::Impl::ViewTracker; + using hooks_policy = typename traits::hooks_policy; + + view_tracker_type m_track; + map_type m_map; + + public: + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + using array_type = + View<typename traits::scalar_array_type, typename traits::array_layout, + typename traits::device_type, typename traits::hooks_policy, + typename traits::memory_traits>; + + /** \brief Compatible view of const data type */ + using const_type = + View<typename traits::const_data_type, typename traits::array_layout, + typename traits::device_type, typename traits::hooks_policy, + typename traits::memory_traits>; + + /** \brief Compatible view of non-const data type */ + using non_const_type = + View<typename traits::non_const_data_type, typename traits::array_layout, + typename traits::device_type, typename traits::hooks_policy, + typename traits::memory_traits>; + + /** \brief Compatible host mirror view */ + using host_mirror_type = + View<typename traits::non_const_data_type, typename traits::array_layout, + Device<DefaultHostExecutionSpace, + typename traits::host_mirror_space::memory_space>, + typename traits::hooks_policy>; + + /** \brief Compatible host mirror view */ + using HostMirror = host_mirror_type; + + /** \brief Unified types */ + using uniform_type 
= typename Impl::ViewUniformType<View, 0>::type; + using uniform_const_type = + typename Impl::ViewUniformType<View, 0>::const_type; + using uniform_runtime_type = + typename Impl::ViewUniformType<View, 0>::runtime_type; + using uniform_runtime_const_type = + typename Impl::ViewUniformType<View, 0>::runtime_const_type; + using uniform_nomemspace_type = + typename Impl::ViewUniformType<View, 0>::nomemspace_type; + using uniform_const_nomemspace_type = + typename Impl::ViewUniformType<View, 0>::const_nomemspace_type; + using uniform_runtime_nomemspace_type = + typename Impl::ViewUniformType<View, 0>::runtime_nomemspace_type; + using uniform_runtime_const_nomemspace_type = + typename Impl::ViewUniformType<View, 0>::runtime_const_nomemspace_type; + + using reference_type = typename map_type::reference_type; + using pointer_type = typename map_type::pointer_type; + + // Typedefs from mdspan + // using extents_type -> not applicable + // Defining layout_type here made MSVC+CUDA fail + // using layout_type = typename traits::array_layout; + // using accessor_type -> not applicable + // using mapping_type -> not applicable + using element_type = typename traits::value_type; + // using value_type -> conflicts with traits::value_type + using index_type = typename traits::memory_space::size_type; + // using size_type -> already from traits::size_type; where it is + // memory_space::size_type + using rank_type = size_t; + using data_handle_type = pointer_type; + using reference = reference_type; + + //---------------------------------------- + // Domain rank and extents + + static constexpr Impl::integral_constant<size_t, traits::dimension::rank> + rank = {}; + static constexpr Impl::integral_constant<size_t, + traits::dimension::rank_dynamic> + rank_dynamic = {}; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = + map_type::Rank}; +#endif + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr 
std::enable_if_t<std::is_integral_v<iType>, + size_t> + extent(const iType& r) const noexcept { + return m_map.extent(r); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return map_type::static_extent(r); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<std::is_integral_v<iType>, + int> + extent_int(const iType& r) const noexcept { + return static_cast<int>(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. + */ + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * + m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * + m_map.dimension_6() * m_map.dimension_7(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<std::is_integral_v<iType>, + size_t> + stride(iType r) const { + return ( + r == 0 + ? m_map.stride_0() + : (r == 1 + ? m_map.stride_1() + : (r == 2 + ? m_map.stride_2() + : (r == 3 + ? 
m_map.stride_3() + : (r == 4 + ? m_map.stride_4() + : (r == 5 + ? m_map.stride_5() + : (r == 6 + ? m_map.stride_6() + : m_map.stride_7()))))))); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range span is the span which contains all members. + + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference_v<reference_type> + }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { + return m_map.span_is_contiguous(); + } + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return m_map.data() != nullptr; + } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_map.data(); + } + + //---------------------------------------- + // Allow specializations to query their specialized map + + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>& + impl_map() const { + return m_map; + } + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::SharedAllocationTracker& impl_track() const { + return m_track.m_tracker; + } + //---------------------------------------- + + private: + static constexpr bool is_layout_left = + std::is_same_v<typename traits::array_layout, Kokkos::LayoutLeft>; + + static constexpr bool is_layout_right = + std::is_same_v<typename traits::array_layout, Kokkos::LayoutRight>; + + static constexpr bool is_layout_stride = + std::is_same_v<typename traits::array_layout, Kokkos::LayoutStride>; + + static constexpr bool is_default_map = + std::is_void_v<typename traits::specialize> && + (is_layout_left || is_layout_right || is_layout_stride); + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) 
\ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); \ + Kokkos::Impl::view_verify_operator_bounds<typename traits::memory_space>( \ + __VA_ARGS__); + +#else + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); + +#endif + + template <typename... Is> + static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral<Is...>::value); + } + + template <typename... Is> + static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral<Is...>::value); + } + + public: + //------------------------------ + // Rank 1 default map operator() + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0>::value && // + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0>::value && // + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 1 operator[] + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + ((1 == rank) && Kokkos::Impl::are_integral<I0>::value && 
!is_default_map), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.reference(i0); + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral<I0>::value && + is_default_map && !is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral<I0>::value && + is_default_map && is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 default map operator() + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1>::value && // + (2 == rank) && is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + operator()(I0 i0, I1 i1) const { + check_operator_parens_valid_args(i0, i1); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + // Rank 0 -> 8 operator() except for rank-1 and 
rank-2 with default map which + // have "inlined" versions above + + template <typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<Is...>::value && // + (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; + } + + template <typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<Is...>::value && // + ((0 == rank) || !is_default_map)), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.reference(indices...); + } + + //------------------------------ + // Rank 0 + + template <typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<Is...>::value && (0 == rank)), reference_type> + access(Is... extra) const { + check_access_member_function_valid_args(extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) + return m_map.reference(); + } + + //------------------------------ + // Rank 1 + + template <typename I0, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value && + (1 == rank) && !is_default_map), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.reference(i0); + } + + template <typename I0, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value && + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + access(I0 i0, Is... 
extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[i0]; + } + + template <typename I0, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0, Is...>::value && + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 + + template <typename I0, typename I1, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, Is...>::value && + (2 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + return m_map.reference(i0, i1); + } + + template <typename I0, typename I1, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, Is...>::value && (2 == rank) && + is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) 
+ if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + //------------------------------ + // Rank 3 + + template <typename I0, typename I1, typename I2, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, Is...>::value && + (3 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; + } + + template <typename I0, typename I1, typename I2, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, Is...>::value && + (3 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.reference(i0, i1, i2); + } + + //------------------------------ + // Rank 4 + + template <typename I0, typename I1, typename I2, typename I3, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == rank) && + is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... 
extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, I2, I3, Is...>::value && (4 == rank) && + !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.reference(i0, i1, i2, i3); + } + + //------------------------------ + // Rank 5 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, Is...>::value && + (5 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, Is...>::value && + (5 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) 
+ return m_map.reference(i0, i1, i2, i3, i4); + } + + //------------------------------ + // Rank 6 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, Is...>::value && + (6 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, Is...>::value && + (6 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5); + } + + //------------------------------ + // Rank 7 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, Is...>::value && + (7 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) 
+ return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, Is...>::value && + (7 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + } + + //------------------------------ + // Rank 8 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, + I7, Is...>::value && + (8 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map + .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7, typename... Is> + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true<I0, I1, I2, I3, I4, I5, I6, + I7, Is...>::value && + (8 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... 
extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); + } + +#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY + + //---------------------------------------- + // Standard destructor, constructors, and assignment operators + + KOKKOS_DEFAULTED_FUNCTION + ~View() = default; + + KOKKOS_DEFAULTED_FUNCTION + View() = default; + + KOKKOS_FUNCTION + View(const View& other) : m_track(other.m_track), m_map(other.m_map) { + KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View(View&& other) + : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { + KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View& operator=(const View& other) { + m_map = other.m_map; + m_track = other.m_track; + + KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) + + return *this; + } + + KOKKOS_FUNCTION + View& operator=(View&& other) { + m_map = std::move(other.m_map); + m_track = std::move(other.m_track); + + KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) + + return *this; + } + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + + template <class RT, class... 
RP> + KOKKOS_INLINE_FUNCTION View( + const View<RT, RP...>& rhs, + std::enable_if_t<Kokkos::Impl::ViewMapping< + traits, typename View<RT, RP...>::traits, + typename traits::specialize>::is_assignable_data_type>* = nullptr) + : m_track(rhs), m_map() { + using SrcTraits = typename View<RT, RP...>::traits; + using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, + typename traits::specialize>; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + } + + template <class RT, class... RP> + KOKKOS_INLINE_FUNCTION std::enable_if_t< + Kokkos::Impl::ViewMapping< + traits, typename View<RT, RP...>::traits, + typename traits::specialize>::is_assignable_data_type, + View>& + operator=(const View<RT, RP...>& rhs) { + using SrcTraits = typename View<RT, RP...>::traits; + using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, + typename traits::specialize>; + static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + m_track.assign(rhs); + return *this; + } + + //---------------------------------------- + // Compatible subview constructor + // may assign unmanaged from managed. + + template <class RT, class... RP, class Arg0, class... Args> + KOKKOS_INLINE_FUNCTION View(const View<RT, RP...>& src_view, const Arg0 arg0, + Args... 
args) + : m_track(src_view), m_map() { + using SrcType = View<RT, RP...>; + + using Mapping = Kokkos::Impl::ViewMapping<void, typename SrcType::traits, + Arg0, Args...>; + + using DstType = typename Mapping::type; + + static_assert( + Kokkos::Impl::ViewMapping<traits, typename DstType::traits, + typename traits::specialize>::is_assignable, + "Subview construction requires compatible view and subview arguments"); + + Mapping::assign(m_map, src_view.m_map, arg0, args...); + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.m_tracker.use_count(); } + + inline const std::string label() const { + return m_track.m_tracker + .template get_label<typename traits::memory_space>(); + } + + public: + //---------------------------------------- + // Allocation according to allocation properties and array layout + + template <class... P> + explicit inline View( + const Impl::ViewCtorProp<P...>& arg_prop, + std::enable_if_t<!Impl::ViewCtorProp<P...>::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track(), m_map() { + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, typename traits::device_type::memory_space{}, + typename traits::device_type::execution_space{}); + using alloc_prop = decltype(prop_copy); + + static_assert(traits::is_managed, + "View allocation constructor requires managed memory"); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. 
+ Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v<typename traits::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename traits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename traits::array_layout, + Kokkos::LayoutStride>) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property<Impl::LabelTag>(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same<typename traits::specialize, void>::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif + + Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( + prop_copy, arg_layout, Impl::ViewCtorProp<P...>::has_execution_space); + + // Setup and initialization complete, start tracking + m_track.m_tracker.assign_allocated_record_to_uninitialized(record); + } + + KOKKOS_INLINE_FUNCTION + void assign_data(pointer_type arg_data) { + m_track.m_tracker.clear(); + m_map.assign_data(arg_data); + } + + // Wrap memory according to properties and array layout + template <class... 
P> + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp<P...>& arg_prop, + std::enable_if_t<Impl::ViewCtorProp<P...>::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track() // No memory tracking + , + m_map(arg_prop, arg_layout) { + static_assert( + std::is_same<pointer_type, + typename Impl::ViewCtorProp<P...>::pointer_type>::value, + "Constructing View to wrap user memory must supply matching pointer " + "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v<typename traits::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename traits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename traits::array_layout, + Kokkos::LayoutStride>) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same<typename traits::specialize, void>::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif + } + + // Simple dimension-only layout + template <class... 
P> + explicit inline View( + const Impl::ViewCtorProp<P...>& arg_prop, + std::enable_if_t<!Impl::ViewCtorProp<P...>::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + template <class... P> + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp<P...>& arg_prop, + std::enable_if_t<Impl::ViewCtorProp<P...>::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + } + + // Allocate with label and layout + template <typename Label> + explicit inline View( + const Label& arg_label, + std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value, + typename traits::array_layout> const& arg_layout) + : View(Impl::ViewCtorProp<std::string>(arg_label), arg_layout) {} + + // Allocate label and layout, must disambiguate from subview constructor. + template <typename Label> + explicit inline View( + const Label& arg_label, + std::enable_if_t<Kokkos::Impl::is_view_label<Label>::value, const size_t> + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp<std::string>(arg_label), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + // Construct view from ViewTracker and map + // This should be the preferred method because future extensions may need to + // use the ViewTracker class. 
+ template <class Traits> + KOKKOS_INLINE_FUNCTION View( + const view_tracker_type& track, + const Kokkos::Impl::ViewMapping<Traits, typename Traits::specialize>& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping<traits, Traits, typename traits::specialize>; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track.m_tracker); + } + + // Construct View from internal shared allocation tracker object and map + // This is here for backwards compatibility for classes that derive from + // Kokkos::View + template <class Traits> + KOKKOS_INLINE_FUNCTION View( + const typename view_tracker_type::track_type& track, + const Kokkos::Impl::ViewMapping<Traits, typename Traits::specialize>& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping<traits, Traits, typename traits::specialize>; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track); + } + + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + typename traits::array_layout const& layout) { + return map_type::memory_span(layout); + } + + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, + const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, + const size_t arg_N6 = 0, const size_t arg_N7 = 0) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + return map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp<pointer_type>(arg_ptr), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp<pointer_type>(arg_ptr), arg_layout) {} + + //---------------------------------------- + // Shared scratch memory constructor + + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. 
Use " + "overload taking a layout object instead."); + const size_t num_passed_args = Impl::count_valid_integers( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + + if (std::is_void_v<typename traits::specialize> && + num_passed_args != rank_dynamic) { + Kokkos::abort( + "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); + } + + return View::shmem_size(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + private: + // Want to be able to align to minimum scratch alignment or sizeof or alignof + // elements + static constexpr size_t scratch_value_alignment = + max({sizeof(typename traits::value_type), + alignof(typename traits::value_type), + static_cast<size_t>( + traits::execution_space::scratch_memory_space::ALIGN)}); + + public: + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(typename traits::array_layout const& arg_layout) { + return map_type::memory_span(arg_layout) + scratch_value_alignment; + } + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp<pointer_type>(reinterpret_cast<pointer_type>( + arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), + scratch_value_alignment))), + arg_layout) {} + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp<pointer_type>( + 
reinterpret_cast<pointer_type>(arg_space.get_shmem_aligned( + map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, + arg_N7)), + scratch_value_alignment))), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template <typename U = typename Impl::MDSpanViewTraits<traits>::mdspan_type> + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits<traits>::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v<Impl::UnsupportedKokkosArrayLayout, U>>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits<traits>::mdspan_type>( + mds.mapping())) { + } + + template <class ElementType, class ExtentsType, class LayoutType, + class AccessorType> + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(!std::is_convertible_v< + Kokkos::mdspan<ElementType, ExtentsType, LayoutType, + AccessorType>, + typename Impl::MDSpanViewTraits<traits>::mdspan_type>) +#endif + View(const Kokkos::mdspan<ElementType, ExtentsType, LayoutType, + AccessorType>& mds) + : View(typename Impl::MDSpanViewTraits<traits>::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template <class OtherElementType, class OtherExtents, class OtherLayoutPolicy, + class OtherAccessor, + class ImplNaturalMDSpanType = + typename Impl::MDSpanViewTraits<traits>::mdspan_type, + typename = std::enable_if_t<std::conditional_t< + std::is_same_v<Impl::UnsupportedKokkosArrayLayout, + ImplNaturalMDSpanType>, + 
std::false_type, + std::is_assignable<mdspan<OtherElementType, OtherExtents, + OtherLayoutPolicy, OtherAccessor>, + ImplNaturalMDSpanType>>::value>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits<traits>::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping<mdspan_type>(m_map)}; + } + + template <class OtherAccessorType = Impl::SpaceAwareAccessor< + typename traits::memory_space, + Kokkos::default_accessor<typename traits::value_type>>, + typename = std::enable_if_t<std::is_assignable_v< + typename traits::value_type*&, + typename OtherAccessorType::data_handle_type>>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = + typename Impl::MDSpanViewTraits<traits>::accessor_type()) { + using mdspan_type = typename Impl::MDSpanViewTraits<traits>::mdspan_type; + using ret_mdspan_type = + mdspan<typename mdspan_type::element_type, + typename mdspan_type::extents_type, + typename mdspan_type::layout_type, OtherAccessorType>; + return ret_mdspan_type{data(), + Impl::mapping_from_view_mapping<mdspan_type>(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN +}; + +template <typename D, class... P> +KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View<D, P...>&) { + return View<D, P...>::rank(); +} + +namespace Impl { + +template <typename ValueType, unsigned int Rank> +struct RankDataType { + using type = typename RankDataType<ValueType, Rank - 1>::type*; +}; + +template <typename ValueType> +struct RankDataType<ValueType, 0> { + using type = ValueType; +}; + +template <unsigned N, typename... 
Args> +KOKKOS_FUNCTION std::enable_if_t< + N == View<Args...>::rank() && + std::is_same_v<typename ViewTraits<Args...>::specialize, void>, + View<Args...>> +as_view_of_rank_n(View<Args...> v) { + return v; +} + +// Placeholder implementation to compile generic code for DynRankView; should +// never be called +template <unsigned N, typename T, typename... Args> +KOKKOS_FUNCTION std::enable_if_t< + N != View<T, Args...>::rank() && + std::is_same_v<typename ViewTraits<T, Args...>::specialize, void>, + View<typename RankDataType<typename View<T, Args...>::value_type, N>::type, + Args...>> +as_view_of_rank_n(View<T, Args...>) { + Kokkos::abort("Trying to get at a View of the wrong rank"); + return {}; +} + +template <typename Function, typename... Args> +void apply_to_view_of_static_rank(Function&& f, View<Args...> a) { + f(a); +} + +} // namespace Impl + +template <class D, class... P, class... Args> +KOKKOS_INLINE_FUNCTION auto subview(const View<D, P...>& src, Args... args) { + static_assert(View<D, P...>::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait<D, P...>::type, + Args...>::type(src, args...); +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template <class MemoryTraits, class D, class... P, class... Args> +KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View<D, P...>& src, + Args... args) { + static_assert(View<D, P...>::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + static_assert(Kokkos::is_memory_traits<MemoryTraits>::value); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait<D, P..., MemoryTraits>::type, + Args...>::type(src, args...); +} +#endif + +template <class V, class... 
Args> +using Subview = decltype(subview(std::declval<V>(), std::declval<Args>()...)); + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class LT, class... LP, class RT, class... RP> +KOKKOS_INLINE_FUNCTION bool operator==(const View<LT, LP...>& lhs, + const View<RT, RP...>& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits<LT, LP...>; + using rhs_traits = ViewTraits<RT, RP...>; + + return std::is_same_v<typename lhs_traits::const_value_type, + typename rhs_traits::const_value_type> && + std::is_same_v<typename lhs_traits::array_layout, + typename rhs_traits::array_layout> && + std::is_same_v<typename lhs_traits::memory_space, + typename rhs_traits::memory_space> && + View<LT, LP...>::rank() == View<RT, RP...>::rank() && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); +} + +template <class LT, class... LP, class RT, class... 
RP> +KOKKOS_INLINE_FUNCTION bool operator!=(const View<LT, LP...>& lhs, + const View<RT, RP...>& rhs) { + return !(operator==(lhs, rhs)); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class Specialize, typename A, typename B> +struct CommonViewValueType; + +template <typename A, typename B> +struct CommonViewValueType<void, A, B> { + using value_type = std::common_type_t<A, B>; +}; + +template <class Specialize, class ValueType> +struct CommonViewAllocProp; + +template <class ValueType> +struct CommonViewAllocProp<void, ValueType> { + using value_type = ValueType; + using scalar_array_type = ValueType; + + template <class... Views> + KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const Views&...) {} +}; + +template <class... Views> +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. prop_type +template <class FirstView> +struct DeduceCommonViewAllocProp<FirstView> { + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view<FirstView>::value }; + + using prop_type = CommonViewAllocProp<specialize, value_type>; +}; + +template <class FirstView, class... 
NextViews> +struct DeduceCommonViewAllocProp<FirstView, NextViews...> { + using NextTraits = DeduceCommonViewAllocProp<NextViews...>; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view<FirstView>::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error + // out + static_assert(!(!std::is_same_v<first_specialize, next_specialize> && + !std::is_void_v<first_specialize> && + !std::is_void_v<next_specialize>), + "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " + "specialize trait allowed"); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = + std::conditional_t<std::is_same_v<first_specialize, next_specialize>, + first_specialize, + std::conditional_t<(std::is_void_v<first_specialize> && + !std::is_void_v<next_specialize>), + next_specialize, first_specialize>>; + + using value_type = typename CommonViewValueType<specialize, first_value_type, + next_value_type>::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp<specialize, value_type>; +}; + +} // end namespace Impl + +template <class... Views> +using DeducedCommonPropsType = + typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type; + +// This function is required in certain scenarios where users customize +// Kokkos View internals. One example are dynamic length embedded ensemble +// types. The function is used to propagate necessary information +// (like the ensemble size) when creating new views. +// However, most of the time it is called with a single view. 
+// Furthermore, the propagated information is not just for view allocations. +// From what I can tell, the type of functionality provided by +// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, +// a mechanism we will eventually use to replace this clunky approach here, when +// we are finally mdspan based. +// TODO: get rid of this when we have mdspan +template <class... Views> +KOKKOS_INLINE_FUNCTION DeducedCommonPropsType<Views...> common_view_alloc_prop( + Views const&... views) { + return DeducedCommonPropsType<Views...>(views...); +} + +} // namespace Kokkos + +#include <View/Kokkos_ViewUniformType.hpp> +#include <View/Kokkos_ViewAtomic.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWLEGACY_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp similarity index 70% rename from packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp index 01d0dc4f68112bea94818ac75c5d0ca61716a16f..ecc19eaf5e25fcf69433d209c877b13740277496 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewMapping.hpp @@ -17,6 +17,7 @@ #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP #define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP +#include <cstring> #include <type_traits> #include <initializer_list> @@ -27,12 +28,15 @@ #include <Kokkos_Extents.hpp> #include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Traits.hpp> -#include <impl/Kokkos_ViewTracker.hpp> -#include <impl/Kokkos_ViewCtor.hpp> -#include <impl/Kokkos_Atomic_View.hpp> +#include <View/Kokkos_ViewTracker.hpp> +#include <View/Kokkos_ViewTraits.hpp> +#include <View/Kokkos_ViewCtor.hpp> +#include <View/Kokkos_ViewAtomic.hpp> #include <impl/Kokkos_Tools.hpp> #include 
<impl/Kokkos_StringManipulation.hpp> #include <impl/Kokkos_ZeroMemset_fwd.hpp> +#include <View/Kokkos_ViewDataAnalysis.hpp> +#include <View/Kokkos_ViewAlloc.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -40,295 +44,25 @@ namespace Kokkos { namespace Impl { -template <unsigned I, size_t... Args> -struct variadic_size_t { - enum : size_t { value = KOKKOS_INVALID_INDEX }; -}; - -template <size_t Val, size_t... Args> -struct variadic_size_t<0, Val, Args...> { - enum : size_t { value = Val }; -}; - -template <unsigned I, size_t Val, size_t... Args> -struct variadic_size_t<I, Val, Args...> { - enum : size_t { value = variadic_size_t<I - 1, Args...>::value }; -}; - -template <size_t... Args> -struct rank_dynamic; - -template <> -struct rank_dynamic<> { - enum : unsigned { value = 0 }; -}; - -template <size_t Val, size_t... Args> -struct rank_dynamic<Val, Args...> { - enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic<Args...>::value }; -}; - -#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ - template <size_t V, unsigned> \ - struct ViewDimension##R { \ - static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - }; \ - template <size_t V, unsigned RD> \ - constexpr size_t ViewDimension##R<V, RD>::ArgN##R; \ - template <size_t V, unsigned RD> \ - constexpr size_t ViewDimension##R<V, RD>::N##R; \ - template <unsigned RD> \ - struct ViewDimension##R<0u, RD> { \ - static constexpr size_t ArgN##R = 0; \ - std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ - }; \ - template <unsigned RD> \ - constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; - -KOKKOS_IMPL_VIEW_DIMENSION(0) -KOKKOS_IMPL_VIEW_DIMENSION(1) -KOKKOS_IMPL_VIEW_DIMENSION(2) -KOKKOS_IMPL_VIEW_DIMENSION(3) -KOKKOS_IMPL_VIEW_DIMENSION(4) -KOKKOS_IMPL_VIEW_DIMENSION(5) -KOKKOS_IMPL_VIEW_DIMENSION(6) -KOKKOS_IMPL_VIEW_DIMENSION(7) - -#undef KOKKOS_IMPL_VIEW_DIMENSION - -// MSVC does not do empty base class optimization by default. -// Per standard it is required for standard layout types -template <size_t... 
Vals> -struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension - : public ViewDimension0<variadic_size_t<0u, Vals...>::value, - rank_dynamic<Vals...>::value>, - public ViewDimension1<variadic_size_t<1u, Vals...>::value, - rank_dynamic<Vals...>::value>, - public ViewDimension2<variadic_size_t<2u, Vals...>::value, - rank_dynamic<Vals...>::value>, - public ViewDimension3<variadic_size_t<3u, Vals...>::value, - rank_dynamic<Vals...>::value>, - public ViewDimension4<variadic_size_t<4u, Vals...>::value, - rank_dynamic<Vals...>::value>, - public ViewDimension5<variadic_size_t<5u, Vals...>::value, - rank_dynamic<Vals...>::value>, - public ViewDimension6<variadic_size_t<6u, Vals...>::value, - rank_dynamic<Vals...>::value>, - public ViewDimension7<variadic_size_t<7u, Vals...>::value, - rank_dynamic<Vals...>::value> { - using D0 = ViewDimension0<variadic_size_t<0U, Vals...>::value, - rank_dynamic<Vals...>::value>; - using D1 = ViewDimension1<variadic_size_t<1U, Vals...>::value, - rank_dynamic<Vals...>::value>; - using D2 = ViewDimension2<variadic_size_t<2U, Vals...>::value, - rank_dynamic<Vals...>::value>; - using D3 = ViewDimension3<variadic_size_t<3U, Vals...>::value, - rank_dynamic<Vals...>::value>; - using D4 = ViewDimension4<variadic_size_t<4U, Vals...>::value, - rank_dynamic<Vals...>::value>; - using D5 = ViewDimension5<variadic_size_t<5U, Vals...>::value, - rank_dynamic<Vals...>::value>; - using D6 = ViewDimension6<variadic_size_t<6U, Vals...>::value, - rank_dynamic<Vals...>::value>; - using D7 = ViewDimension7<variadic_size_t<7U, Vals...>::value, - rank_dynamic<Vals...>::value>; - - using D0::ArgN0; - using D1::ArgN1; - using D2::ArgN2; - using D3::ArgN3; - using D4::ArgN4; - using D5::ArgN5; - using D6::ArgN6; - using D7::ArgN7; - - using D0::N0; - using D1::N1; - using D2::N2; - using D3::N3; - using D4::N4; - using D5::N5; - using D6::N6; - using D7::N7; - - static constexpr unsigned rank = sizeof...(Vals); - static constexpr unsigned rank_dynamic = 
Impl::rank_dynamic<Vals...>::value; - - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; - ViewDimension& operator=(const ViewDimension&) = default; - - KOKKOS_INLINE_FUNCTION - constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, - size_t n5, size_t n6, size_t n7) - : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), - D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), - D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), - D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), - D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4), - D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), - D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), - D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} - - KOKKOS_INLINE_FUNCTION - constexpr size_t extent(const unsigned r) const noexcept { - return r == 0 - ? N0 - : (r == 1 - ? N1 - : (r == 2 - ? N2 - : (r == 3 - ? N3 - : (r == 4 - ? N4 - : (r == 5 - ? N5 - : (r == 6 - ? N6 - : (r == 7 ? N7 - : 0))))))); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return r == 0 - ? ArgN0 - : (r == 1 - ? ArgN1 - : (r == 2 - ? ArgN2 - : (r == 3 - ? ArgN3 - : (r == 4 - ? ArgN4 - : (r == 5 - ? ArgN5 - : (r == 6 - ? ArgN6 - : (r == 7 ? ArgN7 - : 0))))))); - } - - template <size_t N> - struct prepend { - using type = ViewDimension<N, Vals...>; - }; - - template <size_t N> - struct append { - using type = ViewDimension<Vals..., N>; - }; -}; - -template <class A, class B> -struct ViewDimensionJoin; - -template <size_t... A, size_t... B> -struct ViewDimensionJoin<ViewDimension<A...>, ViewDimension<B...>> { - using type = ViewDimension<A..., B...>; -}; - -//---------------------------------------------------------------------------- - -template <class DstDim, class SrcDim> -struct ViewDimensionAssignable; - -template <size_t... DstArgs, size_t... 
SrcArgs> -struct ViewDimensionAssignable<ViewDimension<DstArgs...>, - ViewDimension<SrcArgs...>> { - using dst = ViewDimension<DstArgs...>; - using src = ViewDimension<SrcArgs...>; - - enum { - value = unsigned(dst::rank) == unsigned(src::rank) && - ( - // Compile time check that potential static dimensions match - ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) - ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) - : true) && - ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) - ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) - : true) && - ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) - ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) - : true) && - ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) - ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) - : true) && - ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) - ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) - : true) && - ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) - ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) - : true) && - ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) - ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) - : true) && - ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) - ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) - : true)) - }; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -struct ALL_t { - KOKKOS_INLINE_FUNCTION - constexpr const ALL_t& operator()() const { return *this; } - - KOKKOS_INLINE_FUNCTION - constexpr bool operator==(const ALL_t&) const { return true; } -}; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace Impl { -// TODO This alias declaration forces us to fully qualify ALL_t inside the -// Kokkos::Impl namespace to avoid deprecation warnings. Replace the -// fully-qualified name when we remove Kokkos::Impl::ALL_t. 
-using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = - Kokkos::ALL_t; -} // namespace Impl -#endif -} // namespace Kokkos - -namespace Kokkos { -namespace Impl { - template <class T> struct is_integral_extent_type { - enum : bool { value = std::is_same<T, Kokkos::ALL_t>::value ? 1 : 0 }; + enum : bool { value = std::is_same_v<T, Kokkos::ALL_t> ? 1 : 0 }; }; template <class iType> struct is_integral_extent_type<std::pair<iType, iType>> { - enum : bool { value = std::is_integral<iType>::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v<iType> ? 1 : 0 }; }; template <class iType> struct is_integral_extent_type<Kokkos::pair<iType, iType>> { - enum : bool { value = std::is_integral<iType>::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v<iType> ? 1 : 0 }; }; // Assuming '2 == initializer_list<iType>::size()' template <class iType> struct is_integral_extent_type<std::initializer_list<iType>> { - enum : bool { value = std::is_integral<iType>::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v<iType> ? 1 : 0 }; }; template <unsigned I, class... 
Args> @@ -339,8 +73,7 @@ struct is_integral_extent { enum : bool { value = is_integral_extent_type<type>::value }; - static_assert(value || std::is_integral<type>::value || - std::is_void<type>::value, + static_assert(value || std::is_integral_v<type> || std::is_void_v<type>, "subview argument must be either integral or integral extent"); }; @@ -358,16 +91,16 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> { enum { - value = (((CurrentArg == RankDest - 1) && - (Kokkos::Impl::is_integral_extent_type<Arg>::value)) || - ((CurrentArg >= RankDest) && (std::is_integral<Arg>::value)) || - ((CurrentArg < RankDest) && - (std::is_same<Arg, Kokkos::ALL_t>::value)) || - ((CurrentArg == 0) && - (Kokkos::Impl::is_integral_extent_type<Arg>::value))) && - (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, - RankDest, RankSrc, CurrentArg + 1, - SubViewArgs...>::value) + value = + (((CurrentArg == RankDest - 1) && + (Kokkos::Impl::is_integral_extent_type<Arg>::value)) || + ((CurrentArg >= RankDest) && (std::is_integral_v<Arg>)) || + ((CurrentArg < RankDest) && (std::is_same_v<Arg, Kokkos::ALL_t>)) || + ((CurrentArg == 0) && + (Kokkos::Impl::is_integral_extent_type<Arg>::value))) && + (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, + RankDest, RankSrc, CurrentArg + 1, + SubViewArgs...>::value) }; }; @@ -375,7 +108,7 @@ template <int RankDest, int RankSrc, int CurrentArg, class Arg> struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg, Arg> { enum { - value = ((CurrentArg == RankDest - 1) || (std::is_integral<Arg>::value)) && + value = ((CurrentArg == RankDest - 1) || (std::is_integral_v<Arg>)) && (CurrentArg == RankSrc - 1) }; }; @@ -390,10 +123,9 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, enum { value = (((CurrentArg == RankSrc - RankDest) && 
(Kokkos::Impl::is_integral_extent_type<Arg>::value)) || - ((CurrentArg < RankSrc - RankDest) && - (std::is_integral<Arg>::value)) || + ((CurrentArg < RankSrc - RankDest) && (std::is_integral_v<Arg>)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same<Arg, Kokkos::ALL_t>::value))) && + (std::is_same_v<Arg, Kokkos::ALL_t>))) && (SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg + 1, SubViewArgs...>::value) @@ -404,8 +136,8 @@ template <int RankDest, int RankSrc, int CurrentArg, class Arg> struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg, Arg> { enum { - value = ((CurrentArg == RankSrc - 1) && - (std::is_same<Arg, Kokkos::ALL_t>::value)) + value = + ((CurrentArg == RankSrc - 1) && (std::is_same_v<Arg, Kokkos::ALL_t>)) }; }; @@ -638,7 +370,7 @@ struct SubviewExtents { const int n = snprintf(buffer, LEN, "Kokkos::subview bounds error ("); error(buffer + n, LEN - n, 0, 0, dim, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE(((void)dim; Kokkos::abort("Kokkos::subview bounds error"); @@ -657,21 +389,20 @@ struct SubviewExtents { template <size_t... DimArgs, class... Args> KOKKOS_INLINE_FUNCTION SubviewExtents(const ViewDimension<DimArgs...>& dim, Args... args) { - static_assert(DomainRank == sizeof...(DimArgs), ""); - static_assert(DomainRank == sizeof...(Args), ""); + static_assert(DomainRank == sizeof...(DimArgs)); + static_assert(DomainRank == sizeof...(Args)); // Verifies that all arguments, up to 8, are integral types, // integral extents, or don't exist. 
- static_assert( - RangeRank == unsigned(is_integral_extent<0, Args...>::value) + - unsigned(is_integral_extent<1, Args...>::value) + - unsigned(is_integral_extent<2, Args...>::value) + - unsigned(is_integral_extent<3, Args...>::value) + - unsigned(is_integral_extent<4, Args...>::value) + - unsigned(is_integral_extent<5, Args...>::value) + - unsigned(is_integral_extent<6, Args...>::value) + - unsigned(is_integral_extent<7, Args...>::value), - ""); + static_assert(RangeRank == + unsigned(is_integral_extent<0, Args...>::value) + + unsigned(is_integral_extent<1, Args...>::value) + + unsigned(is_integral_extent<2, Args...>::value) + + unsigned(is_integral_extent<3, Args...>::value) + + unsigned(is_integral_extent<4, Args...>::value) + + unsigned(is_integral_extent<5, Args...>::value) + + unsigned(is_integral_extent<6, Args...>::value) + + unsigned(is_integral_extent<7, Args...>::value)); if (RangeRank == 0) { m_length[0] = 0; @@ -708,149 +439,6 @@ struct SubviewExtents { namespace Kokkos { namespace Impl { - -/** \brief Given a value type and dimension generate the View data type */ -template <class T, class Dim> -struct ViewDataType; - -template <class T> -struct ViewDataType<T, ViewDimension<>> { - using type = T; -}; - -template <class T, size_t... Args> -struct ViewDataType<T, ViewDimension<0, Args...>> { - using type = typename ViewDataType<T*, ViewDimension<Args...>>::type; -}; - -template <class T, size_t N, size_t... Args> -struct ViewDataType<T, ViewDimension<N, Args...>> { - using type = typename ViewDataType<T, ViewDimension<Args...>>::type[N]; -}; - -/**\brief Analysis of View data type. - * - * Data type conforms to one of the following patterns : - * {const} value_type [][#][#][#] - * {const} value_type ***[#][#][#] - * Where the sum of counts of '*' and '[#]' is at most ten. - * - * Provide alias for ViewDimension<...> and value_type. 
- */ -template <class T> -struct ViewArrayAnalysis { - using value_type = T; - using const_value_type = std::add_const_t<T>; - using non_const_value_type = std::remove_const_t<T>; - using static_dimension = ViewDimension<>; - using dynamic_dimension = ViewDimension<>; - using dimension = ViewDimension<>; -}; - -template <class T, size_t N> -struct ViewArrayAnalysis<T[N]> { - private: - using nested = ViewArrayAnalysis<T>; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using static_dimension = - typename nested::static_dimension::template prepend<N>::type; - - using dynamic_dimension = typename nested::dynamic_dimension; - - using dimension = - typename ViewDimensionJoin<dynamic_dimension, static_dimension>::type; -}; - -template <class T> -struct ViewArrayAnalysis<T[]> { - private: - using nested = ViewArrayAnalysis<T>; - using nested_dimension = typename nested::dimension; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin<dynamic_dimension, static_dimension>::type; -}; - -template <class T> -struct ViewArrayAnalysis<T*> { - private: - using nested = ViewArrayAnalysis<T>; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename 
ViewDimensionJoin<dynamic_dimension, static_dimension>::type; -}; - -template <class DataType, class ArrayLayout, class ValueType> -struct ViewDataAnalysis { - private: - using array_analysis = ViewArrayAnalysis<DataType>; - - // ValueType is opportunity for partial specialization. - // Must match array analysis when this default template is used. - static_assert( - std::is_same<ValueType, - typename array_analysis::non_const_value_type>::value, - ""); - - public: - using specialize = void; // No specialization - - using dimension = typename array_analysis::dimension; - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - // Generate analogous multidimensional array specification type. - using type = typename ViewDataType<value_type, dimension>::type; - using const_type = typename ViewDataType<const_value_type, dimension>::type; - using non_const_type = - typename ViewDataType<non_const_value_type, dimension>::type; - - // Generate "flattened" multidimensional array specification type. 
- using scalar_array_type = type; - using const_scalar_array_type = const_type; - using non_const_scalar_array_type = non_const_type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <class Dimension, class Layout, class Enable = void> -struct ViewOffset { - using is_mapping_plugin = std::false_type; -}; - //---------------------------------------------------------------------------- // LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding template <class Dimension> @@ -1039,34 +627,60 @@ struct ViewOffset< m_dim.N5 * m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // FIXME: The version of clang-format in CI fails from maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_dim.N0; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_dim.N0; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { 
+ s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements Stride with [ rank ] value is + // the total length + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1082,8 +696,8 @@ struct ViewOffset< return *this; } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1249,14 +863,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? 
m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1327,34 +944,59 @@ struct ViewOffset< m_dim.N6; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 1; - if (0 < dimension_type::rank) { - s[1] = m_stride; + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + iType n = 1; + if constexpr (0 < dimension_type::rank) { + s[0] = n; + n *= m_stride; } - if (1 < dimension_type::rank) { - s[2] = s[1] * m_dim.N1; + if constexpr (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; } - if (2 < dimension_type::rank) { - s[3] = s[2] * m_dim.N2; + if constexpr (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; } - if (3 < dimension_type::rank) { - s[4] = s[3] * m_dim.N3; + if constexpr (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; } - if (4 < dimension_type::rank) { - s[5] = s[4] * m_dim.N4; + if constexpr (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; } - if (5 < dimension_type::rank) { - s[6] = s[5] * m_dim.N5; + if constexpr (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; } - if (6 < dimension_type::rank) { - s[7] = s[6] * m_dim.N6; + if constexpr (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; } - if (7 < 
dimension_type::rank) { - s[8] = s[7] * m_dim.N7; + if constexpr (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; } + return n; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1410,8 +1052,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1425,7 +1067,11 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride(Padding<TrivialScalarSize>::stride(arg_layout.dimension[0])) {} + m_stride( + arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG + ? arg_layout.stride + : Padding<TrivialScalarSize>::stride(arg_layout.dimension[0])) { + } template <class DimRHS> KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1678,42 +1324,58 @@ struct ViewOffset< m_dim.N1; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; n *= m_dim.N1; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = n; } - s[dimension_type::rank] = n * m_dim.N0; + return n * m_dim.N0; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. 
Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -1730,8 +1392,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1888,14 +1550,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? 
KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1937,8 +1602,8 @@ struct ViewOffset< } KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * - m_dim.N2 * m_dim.N1; + return m_stride == static_cast<size_type>(m_dim.N7) * m_dim.N6 * m_dim.N5 * + m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1; } /* Strides of dimensions */ @@ -1947,59 +1612,77 @@ struct ViewOffset< return m_dim.N7; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { - return m_dim.N7 * m_dim.N6; + return static_cast<size_type>(m_dim.N7) * m_dim.N6; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5; + return static_cast<size_type>(m_dim.N7) * m_dim.N6 * m_dim.N5; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4; + return static_cast<size_type>(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3; + return static_cast<size_type>(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2; + return static_cast<size_type>(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3 * m_dim.N2; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { size_type n = 1; - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = n; n *= m_dim.N7; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = n; n *= m_dim.N6; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = n; n *= m_dim.N5; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = n; n *= m_dim.N4; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = n; n *= m_dim.N3; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = n; n *= m_dim.N2; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = n; } - if (0 < dimension_type::rank) { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride; } - s[dimension_type::rank] = m_stride * m_dim.N0; + return m_stride * m_dim.N0; + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. 
Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2056,13 +1739,31 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif /* Enable padding for trivial scalar types with non-zero trivial scalar size. */ + + private: + template <unsigned TrivialScalarSize> + KOKKOS_FUNCTION constexpr size_type compute_stride( + const Kokkos::LayoutRight& arg_layout) { + if (arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG) + return arg_layout.stride; + size_type value = m_dim.N1; + if constexpr (dimension_type::rank > 2) value *= m_dim.N2; + if constexpr (dimension_type::rank > 3) value *= m_dim.N3; + if constexpr (dimension_type::rank > 4) value *= m_dim.N4; + if constexpr (dimension_type::rank > 5) value *= m_dim.N5; + if constexpr (dimension_type::rank > 6) value *= m_dim.N6; + if constexpr (dimension_type::rank > 7) value *= m_dim.N7; + return Padding<TrivialScalarSize>::stride(value); + } + + public: template <unsigned TrivialScalarSize> KOKKOS_INLINE_FUNCTION constexpr ViewOffset( std::integral_constant<unsigned, TrivialScalarSize> const&, @@ -2071,37 +1772,7 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride( - Padding<TrivialScalarSize>:: - stride(/* 2 <= rank */ - m_dim.N1 * - (dimension_type::rank == 2 - ? size_t(1) - : m_dim.N2 * - (dimension_type::rank == 3 - ? size_t(1) - : m_dim.N3 * - (dimension_type::rank == 4 - ? size_t(1) - : m_dim.N4 * - (dimension_type::rank == - 5 - ? size_t(1) - : m_dim.N5 * - (dimension_type:: - rank == - 6 - ? 
size_t( - 1) - : m_dim.N6 * - (dimension_type:: - rank == - 7 - ? size_t( - 1) - : m_dim - .N7)))))))) { - } + m_stride(compute_stride<TrivialScalarSize>(arg_layout)) {} template <class DimRHS> KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -2193,8 +1864,8 @@ struct ViewStride<0> { static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2208,8 +1879,8 @@ struct ViewStride<1> { static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2223,8 +1894,8 @@ struct ViewStride<2> { size_t S0, S1; static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2238,8 +1909,8 @@ struct ViewStride<3> { size_t S0, S1, S2; static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2253,8 +1924,8 @@ struct ViewStride<4> { size_t S0, S1, S2, S3; static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2268,8 +1939,8 @@ struct ViewStride<5> { size_t 
S0, S1, S2, S3, S4; static constexpr size_t S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2283,8 +1954,8 @@ struct ViewStride<6> { size_t S0, S1, S2, S3, S4, S5; static constexpr size_t S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2298,8 +1969,8 @@ struct ViewStride<7> { size_t S0, S1, S2, S3, S4, S5, S6; static constexpr size_t S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2312,8 +1983,8 @@ template <> struct ViewStride<8> { size_t S0, S1, S2, S3, S4, S5, S6, S7; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2525,34 +2196,50 @@ struct ViewOffset<Dimension, Kokkos::LayoutStride, void> { return m_stride.S7; } - // Stride with [ rank ] value is the total length + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + // The version of clang-format in CI fails from maybe_unused + // clang-format off template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - if (0 < dimension_type::rank) { + KOKKOS_INLINE_FUNCTION iType + stride_fill([[maybe_unused]] iType* const s) const { + if constexpr (0 < dimension_type::rank) { s[0] = m_stride.S0; } - if (1 < dimension_type::rank) { + if constexpr (1 < dimension_type::rank) { s[1] = m_stride.S1; } - if (2 < dimension_type::rank) { + if constexpr (2 < dimension_type::rank) { s[2] = m_stride.S2; } - if (3 < dimension_type::rank) { + if constexpr (3 < dimension_type::rank) { s[3] = m_stride.S3; } - if (4 < dimension_type::rank) { + if constexpr (4 < dimension_type::rank) { s[4] = m_stride.S4; } - if (5 < dimension_type::rank) { + if constexpr (5 < dimension_type::rank) { s[5] = m_stride.S5; } - if (6 < dimension_type::rank) { + if constexpr (6 < dimension_type::rank) { s[6] = m_stride.S6; } - if (7 < dimension_type::rank) { + if constexpr (7 < dimension_type::rank) { s[7] = m_stride.S7; } - s[dimension_type::rank] = span(); + return span(); + } + // clang-format on + + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. 
Preconditions: s must be an + // array of dimension_type::rank + 1 elements + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[dimension_type::rank] = stride_fill(s); } //---------------------------------------- @@ -2574,8 +2261,8 @@ struct ViewOffset<Dimension, Kokkos::LayoutStride, void> { } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -2689,9 +2376,9 @@ struct ViewDataHandle { template <class Traits> struct ViewDataHandle< Traits, - std::enable_if_t<(std::is_same<typename Traits::non_const_value_type, - typename Traits::value_type>::value && - std::is_void<typename Traits::specialize>::value && + std::enable_if_t<(std::is_same_v<typename Traits::non_const_value_type, + typename Traits::value_type> && + std::is_void_v<typename Traits::specialize> && Traits::memory_traits::is_atomic)>> { using value_type = typename Traits::value_type; using handle_type = typename Kokkos::Impl::AtomicViewDataHandle<Traits>; @@ -2713,11 +2400,10 @@ struct ViewDataHandle< template <class Traits> struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void<typename Traits::specialize>::value && - (!Traits::memory_traits::is_aligned) && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v<typename Traits::specialize> && + (!Traits::memory_traits::is_aligned) && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; using handle_type = typename Traits::value_type* KOKKOS_RESTRICT; using return_type = typename Traits::value_type& KOKKOS_RESTRICT; @@ -2737,11 +2423,10 @@ struct ViewDataHandle< template <class Traits> struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void<typename Traits::specialize>::value && - 
Traits::memory_traits::is_aligned && - (!Traits::memory_traits::is_restrict) && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v<typename Traits::specialize> && + Traits::memory_traits::is_aligned && + (!Traits::memory_traits::is_restrict) && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2776,11 +2461,10 @@ struct ViewDataHandle< template <class Traits> struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void<typename Traits::specialize>::value && - Traits::memory_traits::is_aligned && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v<typename Traits::specialize> && + Traits::memory_traits::is_aligned && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2820,313 +2504,14 @@ struct ViewDataHandle< namespace Kokkos { namespace Impl { - -template <typename T> -inline bool is_zero_byte(const T& t) { - using comparison_type = std::conditional_t< - sizeof(T) % sizeof(long long int) == 0, long long int, - std::conditional_t< - sizeof(T) % sizeof(long int) == 0, long int, - std::conditional_t< - sizeof(T) % sizeof(int) == 0, int, - std::conditional_t<sizeof(T) % sizeof(short int) == 0, short int, - char>>>>; - const auto* const ptr = reinterpret_cast<const comparison_type*>(&t); - for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i) - if (ptr[i] != 0) return false; - return true; -} - -//---------------------------------------------------------------------------- - -/* - * The construction, assignment to default, and destruction - * are merged into a single functor. 
- * Primarily to work around an unresolved CUDA back-end bug - * that would lose the destruction cuda device function when - * called from the shared memory tracking destruction. - * Secondarily to have two fewer partial specializations. - */ -template <class DeviceType, class ValueType, - bool IsScalar = std::is_scalar<ValueType>::value> -struct ViewValueFunctor; - -template <class DeviceType, class ValueType> -struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> { - using ExecSpace = typename DeviceType::execution_space; - - struct DestroyTag {}; - struct ConstructTag {}; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - template <class _ValueType = ValueType> - KOKKOS_INLINE_FUNCTION - std::enable_if_t<std::is_default_constructible<_ValueType>::value> - operator()(ConstructTag const&, const size_t i) const { - new (ptr + i) ValueType(); - } - - KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, - const size_t i) const { - (ptr + i)->~ValueType(); - } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) { - functor_instantiate_workaround(); - } - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) { - functor_instantiate_workaround(); - } - - template <typename Dummy = ValueType> - std::enable_if_t<std::is_trivial<Dummy>::value && - std::is_trivially_copy_assignable<ValueType>::value> - construct_dispatch() { - ValueType value{}; -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the 
significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - (void)ZeroMemset< - ExecSpace, Kokkos::View<ValueType*, typename DeviceType::memory_space, - Kokkos::MemoryTraits<Kokkos::Unmanaged>>>( - space, - Kokkos::View<ValueType*, typename DeviceType::memory_space, - Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n), - value); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation<ConstructTag>(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template <typename Dummy = ValueType> - std::enable_if_t<!(std::is_trivial<Dummy>::value && - std::is_trivially_copy_assignable<ValueType>::value)> - construct_dispatch() { - parallel_for_implementation<ConstructTag>(); - } - - template <typename Tag> - void parallel_for_implementation() { - if (!space.in_parallel()) { - using PolicyType = - Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>, Tag>; - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - const std::string functor_name = - (std::is_same_v<Tag, DestroyTag> - ? 
"Kokkos::View::destruction [" + name + "]" - : "Kokkos::View::initialization [" + name + "]"); - Kokkos::Profiling::beginParallelFor( - functor_name, Kokkos::Profiling::Experimental::device_id(space), - &kpID); - } - -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same<ExecSpace, Kokkos::Cuda>::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure( - *this, policy); - closure.execute(); - if (default_exec_space || std::is_same_v<Tag, DestroyTag>) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(Tag{}, i); - } - } - - void construct_shared_allocation() { construct_dispatch(); } - - void destroy_shared_allocation() { - parallel_for_implementation<DestroyTag>(); - } - - // This function is to ensure that the functor with DestroyTag is instantiated - // This is a workaround to avoid "cudaErrorInvalidDeviceFunction" error later - // when the function is queried with cudaFuncGetAttributes - void functor_instantiate_workaround() { -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) - if (false) { - parallel_for_implementation<DestroyTag>(); - } -#endif - } -}; - -template <class DeviceType, class ValueType> -struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> { - using ExecSpace = typename DeviceType::execution_space; - using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t i) const { ptr[i] = ValueType(); } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - 
ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) {} - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) {} - - template <typename Dummy = ValueType> - std::enable_if_t<std::is_trivial<Dummy>::value && - std::is_trivially_copy_assignable<Dummy>::value> - construct_shared_allocation() { - // Shortcut for zero initialization -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - ValueType value{}; - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). 
- Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - - (void)ZeroMemset< - ExecSpace, Kokkos::View<ValueType*, typename DeviceType::memory_space, - Kokkos::MemoryTraits<Kokkos::Unmanaged>>>( - space, - Kokkos::View<ValueType*, typename DeviceType::memory_space, - Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n), - value); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template <typename Dummy = ValueType> - std::enable_if_t<!(std::is_trivial<Dummy>::value && - std::is_trivially_copy_assignable<Dummy>::value)> - construct_shared_allocation() { - parallel_for_implementation(); - } - - void parallel_for_implementation() { - if (!space.in_parallel()) { - PolicyType policy(0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same<ExecSpace, Kokkos::Cuda>::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure( - *this, PolicyType(0, n)); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(i); - } - } - - void destroy_shared_allocation() {} -}; - //---------------------------------------------------------------------------- /** \brief View 
mapping for non-specialized data type and standard layout */ template <class Traits> class ViewMapping< - Traits, - std::enable_if_t<( - std::is_void<typename Traits::specialize>::value && - ViewOffset<typename Traits::dimension, typename Traits::array_layout, - void>::is_mapping_plugin::value)>> { + Traits, std::enable_if_t<(std::is_void_v<typename Traits::specialize> && + ViewOffset<typename Traits::dimension, + typename Traits::array_layout, + void>::is_mapping_plugin::value)>> { public: using offset_type = ViewOffset<typename Traits::dimension, typename Traits::array_layout, void>; @@ -3222,11 +2607,24 @@ class ViewMapping< return m_impl_offset.stride_7(); } + // Fill the target unbounded array s with the stride and the total spanned + // size. This method differs from stride_fill() in that it writes the total + // spanned size to the last index of the array. Preconditions: s must be an + // array of dimension_type::rank + 1 elements template <typename iType> KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { m_impl_offset.stride(s); } + // Fill the target unbounded array s with the stride. + // This method differs from stride() in that it does not write the total + // length to the last index of the array. 
Preconditions: s must be an array of + // dimension_type::rank elements + template <typename iType> + KOKKOS_INLINE_FUNCTION iType stride_fill(iType* const s) const { + return m_impl_offset.stride_fill(s); + } + //---------------------------------------- // Range span @@ -3256,28 +2654,26 @@ class ViewMapping< reference_type reference() const { return m_impl_handle[0]; } template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral<I0>::value && - // if layout is neither stride nor irregular, - // then just use the handle directly - !(std::is_same<typename Traits::array_layout, - Kokkos::LayoutStride>::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v<I0> && + // if layout is neither stride nor irregular, + // then just use the handle directly + !(std::is_same_v<typename Traits::array_layout, Kokkos::LayoutStride> || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[i0]; } template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral<I0>::value && - // if the layout is strided or irregular, then - // we have to use the offset - (std::is_same<typename Traits::array_layout, - Kokkos::LayoutStride>::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v<I0> && + // if the layout is strided or irregular, then + // we have to use the offset + (std::is_same_v<typename Traits::array_layout, Kokkos::LayoutStride> || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[m_impl_offset(i0)]; } @@ -3356,7 +2752,7 @@ class ViewMapping< KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(const ViewMapping&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; + KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) 
= default; KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(ViewMapping&&) = default; //---------------------------------------- @@ -3401,10 +2797,12 @@ class ViewMapping< using memory_space = typename Traits::memory_space; static_assert( SpaceAccessibility<execution_space, memory_space>::accessible); - using value_type = typename Traits::value_type; - using functor_type = - ViewValueFunctor<Kokkos::Device<execution_space, memory_space>, - value_type>; + using device_type = Kokkos::Device<execution_space, memory_space>; + using value_type = typename Traits::value_type; + using functor_type = std::conditional_t< + alloc_prop::sequential_host_init, + ViewValueFunctorSequentialHostInit<device_type, value_type>, + ViewValueFunctor<device_type, value_type>>; using record_type = Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>; @@ -3468,29 +2866,34 @@ template <class DstTraits, class SrcTraits> class ViewMapping< DstTraits, SrcTraits, std::enable_if_t<( - !(std::is_same<typename SrcTraits::array_layout, LayoutStride>:: - value) && // Added to have a new specialization for SrcType of - // LayoutStride + !(std::is_same_v<typename SrcTraits::array_layout, + LayoutStride>)&& // Added to have a new + // specialization for + // SrcType of + // LayoutStride // default mappings - std::is_void<typename DstTraits::specialize>::value && - std::is_void<typename SrcTraits::specialize>::value && + std::is_void_v<typename DstTraits::specialize> && + std::is_void_v<typename SrcTraits::specialize> && ( // same layout - std::is_same<typename DstTraits::array_layout, - typename SrcTraits::array_layout>::value || + std::is_same_v<typename DstTraits::array_layout, + typename SrcTraits::array_layout> || // known layout - ((std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutStride>::value) && - 
(std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutStride>::value))))>> { + ((std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v<typename SrcTraits:: + array_layout, + Kokkos::LayoutLeft> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>))))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -3500,10 +2903,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same<typename DstTraits::value_type, - typename SrcTraits::value_type>::value || - std::is_same<typename DstTraits::value_type, - typename SrcTraits::const_value_type>::value + std::is_same_v<typename DstTraits::value_type, + typename SrcTraits::value_type> || + std::is_same_v<typename DstTraits::value_type, + typename SrcTraits::const_value_type> }; enum { @@ -3513,12 +2916,12 @@ class ViewMapping< }; enum { - is_assignable_layout = - std::is_same<typename DstTraits::array_layout, - typename SrcTraits::array_layout>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutStride>::value || - (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1) + is_assignable_layout = std::is_same_v<typename DstTraits::array_layout, + typename SrcTraits::array_layout> || + std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutStride> || + (DstTraits::dimension::rank == 0) || + (DstTraits::dimension::rank == 1) }; public: @@ -3606,22 +3009,21 @@ class ViewMapping< template <class DstTraits, class SrcTraits> class ViewMapping< DstTraits, SrcTraits, - 
std::enable_if_t<( - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutStride>::value && - std::is_void<typename DstTraits::specialize>::value && - std::is_void<typename SrcTraits::specialize>::value && - ( - // same layout - std::is_same<typename DstTraits::array_layout, - typename SrcTraits::array_layout>::value || - // known layout - (std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutStride>::value)))>> { + std::enable_if_t<(std::is_same_v<typename SrcTraits::array_layout, + Kokkos::LayoutStride> && + std::is_void_v<typename DstTraits::specialize> && + std::is_void_v<typename SrcTraits::specialize> && + ( + // same layout + std::is_same_v<typename DstTraits::array_layout, + typename SrcTraits::array_layout> || + // known layout + (std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutStride>)))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -3631,10 +3033,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same<typename DstTraits::value_type, - typename SrcTraits::value_type>::value || - std::is_same<typename DstTraits::value_type, - typename SrcTraits::const_value_type>::value + std::is_same_v<typename DstTraits::value_type, + typename SrcTraits::value_type> || + std::is_same_v<typename DstTraits::value_type, + typename SrcTraits::const_value_type> }; enum { @@ -3665,8 +3067,7 @@ class ViewMapping< bool assignable = true; src.stride(strides); size_t exp_stride = 1; - if (std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutLeft>::value) { + if (std::is_same_v<typename DstTraits::array_layout, Kokkos::LayoutLeft>) { for (int i = 0; i < (int)src.Rank; i++) { 
if (i > 0) exp_stride *= src.extent(i - 1); if (strides[i] != exp_stride) { @@ -3674,8 +3075,8 @@ class ViewMapping< break; } } - } else if (std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutRight>::value) { + } else if (std::is_same_v<typename DstTraits::array_layout, + Kokkos::LayoutRight>) { for (int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(src.Rank - i); if (strides[src.Rank - 1 - i] != exp_stride) { @@ -3768,16 +3169,16 @@ struct SubViewDataTypeImpl<void, ValueType, Kokkos::Experimental::Extents<>> { }; /* for integral args, subview doesn't have that dimension */ -template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class Integral, +template <class ValueType, size_t Ext, size_t... Exts, class Integral, class... Args> struct SubViewDataTypeImpl< - std::enable_if_t<std::is_integral<std::decay_t<Integral>>::value>, - ValueType, Kokkos::Experimental::Extents<Ext, Exts...>, Integral, Args...> + std::enable_if_t<std::is_integral_v<std::decay_t<Integral>>>, ValueType, + Kokkos::Experimental::Extents<Ext, Exts...>, Integral, Args...> : SubViewDataTypeImpl<void, ValueType, Kokkos::Experimental::Extents<Exts...>, Args...> {}; /* for ALL slice, subview has the same dimension */ -template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class... Args> +template <class ValueType, size_t Ext, size_t... Exts, class... Args> struct SubViewDataTypeImpl<void, ValueType, Kokkos::Experimental::Extents<Ext, Exts...>, Kokkos::ALL_t, Args...> @@ -3788,7 +3189,7 @@ struct SubViewDataTypeImpl<void, ValueType, * static sizes */ /* Since we don't allow interleaving of dynamic and static extents, make all of * the dimensions to the left dynamic */ -template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class PairLike, +template <class ValueType, size_t Ext, size_t... Exts, class PairLike, class... 
Args> struct SubViewDataTypeImpl< std::enable_if_t<is_pair_like<PairLike>::value>, ValueType, @@ -3804,13 +3205,13 @@ struct SubViewDataType : SubViewDataTypeImpl<void, ValueType, Exts, Args...> {}; template <class SrcTraits, class... Args> class ViewMapping< - std::enable_if_t<(std::is_void<typename SrcTraits::specialize>::value && - (std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutStride>::value))>, + std::enable_if_t<( + std::is_void_v<typename SrcTraits::specialize> && + (std::is_same_v<typename SrcTraits::array_layout, Kokkos::LayoutLeft> || + std::is_same_v<typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v<typename SrcTraits::array_layout, + Kokkos::LayoutStride>))>, SrcTraits, Args...> { private: static_assert(SrcTraits::rank == sizeof...(Args), @@ -3866,14 +3267,14 @@ class ViewMapping< // OutputRank 1 or 2, InputLayout Left, Interval 0 // because single stride one or second index has a stride. (rank <= 2 && R0 && - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutLeft>::value) // replace with input rank + std::is_same_v<typename SrcTraits::array_layout, + Kokkos::LayoutLeft>) // replace with input rank || // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] // because single stride one or second index has a stride. 
(rank <= 2 && R0_rev && - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutRight>::value) // replace input rank + std::is_same_v<typename SrcTraits::array_layout, + Kokkos::LayoutRight>) // replace input rank ), typename SrcTraits::array_layout, Kokkos::LayoutStride>; @@ -3896,7 +3297,7 @@ class ViewMapping< template <class MemoryTraits> struct apply { - static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, ""); + static_assert(Kokkos::is_memory_traits<MemoryTraits>::value); using traits_type = Kokkos::ViewTraits<data_type, array_layout, @@ -3945,110 +3346,86 @@ class ViewMapping< namespace Kokkos { namespace Impl { -template <unsigned, class MapType> -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType&) { - return true; +template <class Map, class... Indices, std::size_t... Enumerate> +KOKKOS_FUNCTION bool within_range(Map const& map, + std::index_sequence<Enumerate...>, + Indices... indices) { + return (((std::size_t)indices < map.extent(Enumerate)) && ...); } -template <unsigned R, class MapType, class iType, class... Args> -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType& map, - const iType& i, - Args... args) { - return (size_t(i) < map.extent(R)) && - view_verify_operator_bounds<R + 1>(map, args...); +template <class... Indices> +KOKKOS_FUNCTION constexpr char* append_formatted_multidimensional_index( + char* dest, Indices... indices) { + char* d = dest; + strcat(d, "["); + ( + [&] { + d += strlen(d); + to_chars_i(d, + d + 20, // 20 digits ought to be enough + indices); + strcat(d, ","); + }(), + ...); + d[strlen(d) - 1] = ']'; // overwrite trailing comma + return dest; } -template <unsigned, class MapType> -inline void view_error_operator_bounds(char*, int, const MapType&) {} - -template <unsigned R, class MapType, class iType, class... Args> -inline void view_error_operator_bounds(char* buf, int len, const MapType& map, - const iType& i, Args... 
args) { - const int n = snprintf( - buf, len, " %ld < %ld %c", static_cast<unsigned long>(i), - static_cast<unsigned long>(map.extent(R)), (sizeof...(Args) ? ',' : ')')); - view_error_operator_bounds<R + 1>(buf + n, len - n, map, args...); +template <class Map, class... Indices, std::size_t... Enumerate> +KOKKOS_FUNCTION void print_extents(char* dest, Map const& map, + std::index_sequence<Enumerate...>) { + append_formatted_multidimensional_index(dest, map.extent(Enumerate)...); } -/* Check #3: is the View managed as determined by the MemoryTraits? */ -template <class MapType, bool is_managed = (MapType::is_managed != 0)> -struct OperatorBoundsErrorOnDevice; - -template <class MapType> -struct OperatorBoundsErrorOnDevice<MapType, false> { - KOKKOS_INLINE_FUNCTION - static void run(MapType const&) { Kokkos::abort("View bounds error"); } -}; - -template <class MapType> -struct OperatorBoundsErrorOnDevice<MapType, true> { - KOKKOS_INLINE_FUNCTION - static void run(MapType const& map) { - SharedAllocationHeader const* const header = - SharedAllocationHeader::get_header( - static_cast<void const*>(map.data())); - char const* const label = header->label(); - enum { LEN = 128 }; - char msg[LEN]; - char const* const first_part = "View bounds error of view "; - char* p = msg; - char* const end = msg + LEN - 1; - for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - *p = '\0'; - Kokkos::abort(msg); - } -}; - -/* Check #2: does the ViewMapping have the printable_label_typedef defined? - See above that only the non-specialized standard-layout ViewMapping has - this defined by default. 
- The existence of this alias indicates the existence of MapType::is_managed - */ template <class T> using printable_label_typedef_t = typename T::printable_label_typedef; -template <class Map> -KOKKOS_FUNCTION - std::enable_if_t<!is_detected<printable_label_typedef_t, Map>::value> - operator_bounds_error_on_device(Map const&) { - Kokkos::abort("View bounds error"); -} - -template <class Map> -KOKKOS_FUNCTION - std::enable_if_t<is_detected<printable_label_typedef_t, Map>::value> - operator_bounds_error_on_device(Map const& map) { - OperatorBoundsErrorOnDevice<Map>::run(map); -} - template <class MemorySpace, class ViewType, class MapType, class... Args> KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds( Kokkos::Impl::ViewTracker<ViewType> const& tracker, const MapType& map, Args... args) { - if (!view_verify_operator_bounds<0>(map, args...)) { + if (!within_range(map, std::make_index_sequence<sizeof...(Args)>(), + args...)) { + char err[256] = ""; + strcat(err, "Kokkos::View ERROR: out of bounds access"); + strcat(err, " label=(\""); KOKKOS_IF_ON_HOST( - (enum {LEN = 1024}; char buffer[LEN]; - const std::string label = - tracker.m_tracker.template get_label<MemorySpace>(); - int n = snprintf(buffer, LEN, "View bounds error of view %s (", - label.c_str()); - view_error_operator_bounds<0>(buffer + n, LEN - n, map, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if its not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. 
*/ if (tracker.m_tracker.has_record()) { - operator_bounds_error_on_device(map); - } else { Kokkos::abort("View bounds error"); })) + strncat(err, tracker.m_tracker.template get_label<void>().c_str(), + 128); + } else { strcat(err, "**UNMANAGED**"); }) + KOKKOS_IF_ON_DEVICE([&] { + // Check #1: is there a SharedAllocationRecord? (we won't use it, but + // if its not there then there isn't a corresponding + // SharedAllocationHeader containing a label). This check should cover + // the case of Views that don't have the Unmanaged trait but were + // initialized by pointer. + if (!tracker.m_tracker.has_record()) { + strcat(err, "**UNMANAGED**"); + return; + } + // Check #2: does the ViewMapping have the printable_label_typedef + // defined? See above that only the non-specialized standard-layout + // ViewMapping has this defined by default. The existence of this + // alias indicates the existence of MapType::is_managed + if constexpr (is_detected_v<printable_label_typedef_t, MapType>) { + // Check #3: is the View managed as determined by the MemoryTraits? 
+ if constexpr (MapType::is_managed != 0) { + SharedAllocationHeader const* const header = + SharedAllocationHeader::get_header( + static_cast<void const*>(map.data())); + char const* const label = header->label(); + strcat(err, label); + return; + } + strcat(err, "**UNAVAILABLE**"); + } + }();) + strcat(err, "\") with indices "); + append_formatted_multidimensional_index(err, args...); + strcat(err, " but extents "); + print_extents(err, map, std::make_index_sequence<sizeof...(Args)>()); + Kokkos::abort(err); } } diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/packages/kokkos/core/src/View/Kokkos_ViewTracker.hpp similarity index 100% rename from packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewTracker.hpp diff --git a/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp b/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5eddfc68e0704b5ea9d369e6e94913476b0edbe8 --- /dev/null +++ b/packages/kokkos/core/src/View/Kokkos_ViewTraits.hpp @@ -0,0 +1,457 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include <Kokkos_Macros.hpp> +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWTRAITS_HPP +#define KOKKOS_VIEWTRAITS_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <Kokkos_ExecPolicy.hpp> +#include <View/Hooks/Kokkos_ViewHooks.hpp> +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include <View/MDSpan/Kokkos_MDSpan_Layout.hpp> +#include <View/MDSpan/Kokkos_MDSpan_Accessor.hpp> +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct ALL_t { + KOKKOS_FUNCTION + constexpr const ALL_t& operator()() const { return *this; } + + KOKKOS_FUNCTION + constexpr bool operator==(const ALL_t&) const { return true; } +}; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +// TODO This alias declaration forces us to fully qualify ALL_t inside the +// Kokkos::Impl namespace to avoid deprecation warnings. Replace the +// fully-qualified name when we remove Kokkos::Impl::ALL_t. 
+using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = + Kokkos::ALL_t; +} // namespace Impl +#endif + +// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with +// the OpenMPTarget backend +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp declare target +#endif + +inline constexpr Kokkos::ALL_t ALL{}; + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp end declare target +#endif + +namespace Impl { + +template <class DataType> +struct ViewArrayAnalysis; + +template <class DataType, class ArrayLayout, + typename ValueType = + typename ViewArrayAnalysis<DataType>::non_const_value_type> +struct ViewDataAnalysis; + +template <class, class...> +class ViewMapping { + public: + enum : bool { is_assignable_data_type = false }; + enum : bool { is_assignable = false }; +}; + +template <typename IntType> +constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( + const IntType i0, const IntType i1, const IntType i2, const IntType i3, + const IntType i4, const IntType i5, const IntType i6, const IntType i7) { + static_assert(std::is_integral_v<IntType>, + "count_valid_integers() must have integer arguments."); + + return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + + (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + + (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + + (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); +} + +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. 
+template <typename View> +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { + (void)(label); + + if (is_void_spec) { + const size_t num_passed_args = + count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } + + if (!n_args_is_dyn_rank && !n_args_is_rank) { + KOKKOS_IF_ON_HOST( + const std::string message = + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. 
The number " + "of arguments = " + + std::to_string(num_passed_args) + + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " + "mismatched number of arguments.");) + } + } +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +// Class to provide a uniform type +namespace Kokkos { +namespace Impl { +template <class ViewType, int Traits = 0> +struct ViewUniformType; +} +} // namespace Kokkos + +namespace Kokkos { + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template <class Traits, class Enabled = void> +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template <class Traits> +struct MDSpanViewTraits<Traits, std::void_t<typename LayoutFromArrayLayout< + typename Traits::array_layout>::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType<index_type, + typename Traits::data_type>::type; + using mdspan_layout_type = + typename LayoutFromArrayLayout<typename Traits::array_layout>::type; + using accessor_type = + SpaceAwareAccessor<typename Traits::memory_space, + Kokkos::default_accessor<typename Traits::value_type>>; + using mdspan_type = mdspan<typename Traits::value_type, extents_type, + mdspan_layout_type, accessor_type>; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. 
+ * + * Template argument options: + * - View< DataType > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , ArrayLayout > + * - View< DataType , ArrayLayout , Space > + * - View< DataType , ArrayLayout , MemoryTraits > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + */ + +template <class DataType, class... Properties> +struct ViewTraits; + +template <> +struct ViewTraits<void> { + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = void; + using specialize = void; + using hooks_policy = void; +}; + +template <class... Prop> +struct ViewTraits<void, void, Prop...> { + // Ignore an extraneous 'void' + using execution_space = typename ViewTraits<void, Prop...>::execution_space; + using memory_space = typename ViewTraits<void, Prop...>::memory_space; + using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace; + using array_layout = typename ViewTraits<void, Prop...>::array_layout; + using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; + using specialize = typename ViewTraits<void, Prop...>::specialize; + using hooks_policy = typename ViewTraits<void, Prop...>::hooks_policy; +}; + +template <class HooksPolicy, class... 
Prop> +struct ViewTraits< + std::enable_if_t<Kokkos::Experimental::is_hooks_policy<HooksPolicy>::value>, + HooksPolicy, Prop...> { + using execution_space = typename ViewTraits<void, Prop...>::execution_space; + using memory_space = typename ViewTraits<void, Prop...>::memory_space; + using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace; + using array_layout = typename ViewTraits<void, Prop...>::array_layout; + using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; + using specialize = typename ViewTraits<void, Prop...>::specialize; + using hooks_policy = HooksPolicy; +}; + +template <class ArrayLayout, class... Prop> +struct ViewTraits<std::enable_if_t<Kokkos::is_array_layout<ArrayLayout>::value>, + ArrayLayout, Prop...> { + // Specify layout, keep subsequent space and memory traits arguments + + using execution_space = typename ViewTraits<void, Prop...>::execution_space; + using memory_space = typename ViewTraits<void, Prop...>::memory_space; + using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace; + using array_layout = ArrayLayout; + using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; + using specialize = typename ViewTraits<void, Prop...>::specialize; + using hooks_policy = typename ViewTraits<void, Prop...>::hooks_policy; +}; + +template <class Space, class... Prop> +struct ViewTraits<std::enable_if_t<Kokkos::is_space<Space>::value>, Space, + Prop...> { + // Specify Space, memory traits should be the only subsequent argument. 
+ + static_assert( + std::is_same_v<typename ViewTraits<void, Prop...>::execution_space, + void> && + std::is_same_v<typename ViewTraits<void, Prop...>::memory_space, + void> && + std::is_same_v<typename ViewTraits<void, Prop...>::HostMirrorSpace, + void> && + std::is_same_v<typename ViewTraits<void, Prop...>::array_layout, + void>, + "Only one View Execution or Memory Space template argument"); + + using execution_space = typename Space::execution_space; + using memory_space = typename Space::memory_space; + using HostMirrorSpace = + typename Kokkos::Impl::HostMirror<Space>::Space::memory_space; + using array_layout = typename execution_space::array_layout; + using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; + using specialize = typename ViewTraits<void, Prop...>::specialize; + using hooks_policy = typename ViewTraits<void, Prop...>::hooks_policy; +}; + +template <class MemoryTraits, class... Prop> +struct ViewTraits< + std::enable_if_t<Kokkos::is_memory_traits<MemoryTraits>::value>, + MemoryTraits, Prop...> { + // Specify memory trait, should not be any subsequent arguments + + static_assert( + std::is_same_v<typename ViewTraits<void, Prop...>::execution_space, + void> && + std::is_same_v<typename ViewTraits<void, Prop...>::memory_space, + void> && + std::is_same_v<typename ViewTraits<void, Prop...>::array_layout, + void> && + std::is_same_v<typename ViewTraits<void, Prop...>::memory_traits, + void> && + std::is_same_v<typename ViewTraits<void, Prop...>::hooks_policy, + void>, + "MemoryTrait is the final optional template argument for a View"); + + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = MemoryTraits; + using specialize = void; + using hooks_policy = void; +}; + +template <class DataType, class... 
Properties> +struct ViewTraits { + private: + // Unpack the properties arguments + using prop = ViewTraits<void, Properties...>; + + using ExecutionSpace = + std::conditional_t<!std::is_void_v<typename prop::execution_space>, + typename prop::execution_space, + Kokkos::DefaultExecutionSpace>; + + using MemorySpace = + std::conditional_t<!std::is_void_v<typename prop::memory_space>, + typename prop::memory_space, + typename ExecutionSpace::memory_space>; + + using ArrayLayout = + std::conditional_t<!std::is_void_v<typename prop::array_layout>, + typename prop::array_layout, + typename ExecutionSpace::array_layout>; + + using HostMirrorSpace = std::conditional_t< + !std::is_void_v<typename prop::HostMirrorSpace>, + typename prop::HostMirrorSpace, + typename Kokkos::Impl::HostMirror<ExecutionSpace>::Space>; + + using MemoryTraits = + std::conditional_t<!std::is_void_v<typename prop::memory_traits>, + typename prop::memory_traits, + typename Kokkos::MemoryManaged>; + + using HooksPolicy = + std::conditional_t<!std::is_void_v<typename prop::hooks_policy>, + typename prop::hooks_policy, + Kokkos::Experimental::DefaultViewHooks>; + + // Analyze data type's properties, + // May be specialized based upon the layout and value type + using data_analysis = Kokkos::Impl::ViewDataAnalysis<DataType, ArrayLayout>; + + public: + //------------------------------------ + // Data type traits: + + using data_type = typename data_analysis::type; + using const_data_type = typename data_analysis::const_type; + using non_const_data_type = typename data_analysis::non_const_type; + + //------------------------------------ + // Compatible array of trivial type traits: + + using scalar_array_type = typename data_analysis::scalar_array_type; + using const_scalar_array_type = + typename data_analysis::const_scalar_array_type; + using non_const_scalar_array_type = + typename data_analysis::non_const_scalar_array_type; + + //------------------------------------ + // Value type traits: + + using 
value_type = typename data_analysis::value_type; + using const_value_type = typename data_analysis::const_value_type; + using non_const_value_type = typename data_analysis::non_const_value_type; + + //------------------------------------ + // Mapping traits: + + using array_layout = ArrayLayout; + using dimension = typename data_analysis::dimension; + + using specialize = std::conditional_t< + std::is_void_v<typename data_analysis::specialize>, + typename prop::specialize, + typename data_analysis::specialize>; /* mapping specialization tag */ + + static constexpr unsigned rank = dimension::rank; + static constexpr unsigned rank_dynamic = dimension::rank_dynamic; + + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device<ExecutionSpace, MemorySpace>; + using memory_traits = MemoryTraits; + using host_mirror_space = HostMirrorSpace; + using hooks_policy = HooksPolicy; + + using size_type = typename MemorySpace::size_type; + + enum { is_hostspace = std::is_same_v<MemorySpace, HostSpace> }; + enum { is_managed = MemoryTraits::is_unmanaged == 0 }; + enum { is_random_access = MemoryTraits::is_random_access == 1 }; + + //------------------------------------ +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Impl { +template <class ValueType, class TypeList> +struct TypeListToViewTraits; + +template <class ValueType, class... Properties> +struct TypeListToViewTraits<ValueType, Kokkos::Impl::type_list<Properties...>> { + using type = ViewTraits<ValueType, Properties...>; +}; + +// It is not safe to assume that subviews of views with the Aligned memory trait +// are also aligned. Hence, just remove that attribute for subviews. +template <class D, class... 
P> +struct RemoveAlignedMemoryTrait { + private: + using type_list_in = Kokkos::Impl::type_list<P...>; + using memory_traits = typename ViewTraits<D, P...>::memory_traits; + using type_list_in_wo_memory_traits = + typename Kokkos::Impl::type_list_remove_first<memory_traits, + type_list_in>::type; + using new_memory_traits = + Kokkos::MemoryTraits<memory_traits::impl_value & ~Kokkos::Aligned>; + using new_type_list = typename Kokkos::Impl::concat_type_list< + type_list_in_wo_memory_traits, + Kokkos::Impl::type_list<new_memory_traits>>::type; + + public: + using type = typename TypeListToViewTraits<D, new_type_list>::type; +}; +} // namespace Impl + +} /* namespace Kokkos */ + +#endif /* KOKKOS_VIEWTRAITS_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp b/packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp similarity index 88% rename from packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp rename to packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp index 7de2869a0d884c5f641ecfbbea345c27bb664df6..1e476132858c10e951919048a6d074e15af03676 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewUniformType.hpp @@ -24,11 +24,14 @@ namespace Impl { template <class ScalarType, int Rank> struct ViewScalarToDataType { using type = typename ViewScalarToDataType<ScalarType, Rank - 1>::type *; + using const_type = + typename ViewScalarToDataType<ScalarType, Rank - 1>::const_type *; }; template <class ScalarType> struct ViewScalarToDataType<ScalarType, 0> { - using type = ScalarType; + using type = ScalarType; + using const_type = const ScalarType; }; template <class LayoutType, int Rank> @@ -49,12 +52,13 @@ struct ViewUniformLayout<Kokkos::LayoutRight, 1> { template <class ViewType, int Traits> struct ViewUniformType { using data_type = typename ViewType::data_type; - using const_data_type = std::add_const_t<typename ViewType::data_type>; + using const_data_type = typename 
ViewType::const_data_type; using runtime_data_type = typename ViewScalarToDataType<typename ViewType::value_type, ViewType::rank>::type; - using runtime_const_data_type = typename ViewScalarToDataType< - std::add_const_t<typename ViewType::value_type>, ViewType::rank>::type; + using runtime_const_data_type = + typename ViewScalarToDataType<typename ViewType::value_type, + ViewType::rank>::const_type; using array_layout = typename ViewUniformLayout<typename ViewType::array_layout, diff --git a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Accessor.hpp b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Accessor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..929569ee1805eaa6ad148db74cb6a6a2555b0fb3 --- /dev/null +++ b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Accessor.hpp @@ -0,0 +1,411 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_MDSPAN_ACCESSOR_HPP +#define KOKKOS_MDSPAN_ACCESSOR_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Concepts.hpp> +#include <Kokkos_Core_fwd.hpp> +#include <desul/atomics.hpp> + +namespace Kokkos { + +// For now use the accessors in Impl namespace, as an +// implementation detail for rebasing View on mdspan +namespace Impl { + +template <class MemorySpace, class NestedAccessor> +struct SpaceAwareAccessor { + // Part of Accessor Requirements + using element_type = typename NestedAccessor::element_type; + using reference = typename NestedAccessor::reference; + using data_handle_type = typename NestedAccessor::data_handle_type; + using offset_policy = + SpaceAwareAccessor<MemorySpace, typename NestedAccessor::offset_policy>; + + // Specific to SpaceAwareAccessor + using memory_space = MemorySpace; + using nested_accessor_type = NestedAccessor; + + static_assert(is_memory_space_v<memory_space>); + + KOKKOS_DEFAULTED_FUNCTION + constexpr SpaceAwareAccessor() = default; + + template < + class OtherMemorySpace, class OtherNestedAccessorType, + std::enable_if_t< + MemorySpaceAccess<MemorySpace, OtherMemorySpace>::assignable && + std::is_constructible_v<NestedAccessor, OtherNestedAccessorType>, + int> = 0> + KOKKOS_FUNCTION constexpr SpaceAwareAccessor( + const SpaceAwareAccessor<OtherMemorySpace, OtherNestedAccessorType>& + other) noexcept + : nested_acc(other.nested_acc) {} + + KOKKOS_FUNCTION + SpaceAwareAccessor(const NestedAccessor& acc) : nested_acc(acc) {} + + KOKKOS_FUNCTION + explicit operator NestedAccessor() const { return nested_acc; } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const noexcept { + Kokkos::Impl::runtime_check_memory_access_violation<memory_space>( + "Kokkos::SpaceAwareAccessor ERROR: 
attempt to access inaccessible " + "memory space"); + return nested_acc.access(p, i); + } + + KOKKOS_FUNCTION + constexpr typename offset_policy::data_handle_type offset( + data_handle_type p, size_t i) const noexcept { + return nested_acc.offset(p, i); + } + + // Canonical way for accessing nested accessor see ISO C++ + // [linalg.scaled.scaledaccessor] + KOKKOS_FUNCTION + constexpr const NestedAccessor& nested_accessor() const noexcept { + return nested_acc; + } + + private: +// We either compile with our custom mdspan impl +// in which case we discover inside it whether no_unique_address +// works, or we use C++23 in which case it better be available +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor nested_acc; + template <class, class> + friend struct SpaceAwareAccessor; +}; + +template <class NestedAccessor> +struct SpaceAwareAccessor<AnonymousSpace, NestedAccessor> { + // Part of Accessor Requirements + using element_type = typename NestedAccessor::element_type; + using reference = typename NestedAccessor::reference; + using data_handle_type = typename NestedAccessor::data_handle_type; + + using offset_policy = + SpaceAwareAccessor<AnonymousSpace, + typename NestedAccessor::offset_policy>; + + // Specific to SpaceAwareAccessor + using memory_space = AnonymousSpace; + using nested_accessor_type = NestedAccessor; + + KOKKOS_DEFAULTED_FUNCTION + constexpr SpaceAwareAccessor() = default; + + template <class OtherMemorySpace, class OtherNestedAccessorType, + std::enable_if_t<std::is_constructible_v<NestedAccessor, + OtherNestedAccessorType>, + int> = 0> + KOKKOS_FUNCTION constexpr SpaceAwareAccessor( + const SpaceAwareAccessor<OtherMemorySpace, OtherNestedAccessorType>& + other) noexcept + : nested_acc(other.nested_acc) {} + + KOKKOS_FUNCTION + SpaceAwareAccessor(const NestedAccessor& acc) : nested_acc(acc) {} + + KOKKOS_FUNCTION + explicit operator NestedAccessor() const { return nested_acc; } + 
+ KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const noexcept { + return nested_acc.access(p, i); + } + + KOKKOS_FUNCTION + constexpr typename offset_policy::data_handle_type offset( + data_handle_type p, size_t i) const noexcept { + return nested_acc.offset(p, i); + } + + // Canonical way for accessing nested accessor see ISO C++ + // [linalg.scaled.scaledaccessor] + KOKKOS_FUNCTION + constexpr const NestedAccessor& nested_accessor() const noexcept { + return nested_acc; + } + + private: +// We either compile with our custom mdspan impl +// in which case we discover inside it whether no_unique_address +// works, or we use C++23 in which case it better be available +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor nested_acc; + template <class, class> + friend struct SpaceAwareAccessor; +}; + +// Like atomic_accessor_relaxed proposed for ISO C++26 but with +// defaulted memory scope - similar to how desul's AtomicRef has a memory scope +template <class ElementType, class MemoryScope = desul::MemoryScopeDevice> +struct AtomicAccessorRelaxed { + using element_type = ElementType; + using reference = + desul::AtomicRef<ElementType, desul::MemoryOrderRelaxed, MemoryScope>; + using data_handle_type = ElementType*; + using offset_policy = AtomicAccessorRelaxed; + + KOKKOS_DEFAULTED_FUNCTION + AtomicAccessorRelaxed() = default; + + // Conversions from non-const to const element type + template <class OtherElementType, + std::enable_if_t<std::is_convertible_v< + OtherElementType (*)[], element_type (*)[]>>* = nullptr> + KOKKOS_FUNCTION constexpr AtomicAccessorRelaxed( + Kokkos::default_accessor<OtherElementType>) noexcept {} + + template <class OtherElementType, + std::enable_if_t<std::is_convertible_v< + OtherElementType (*)[], element_type (*)[]>>* = nullptr> + KOKKOS_FUNCTION constexpr AtomicAccessorRelaxed( + AtomicAccessorRelaxed<OtherElementType, MemoryScope>) noexcept {} 
+ + template <class OtherElementType, + std::enable_if_t<std::is_convertible_v< + element_type (*)[], OtherElementType (*)[]>>* = nullptr> + KOKKOS_FUNCTION explicit operator default_accessor<OtherElementType>() const { + return default_accessor<OtherElementType>{}; + } + + KOKKOS_FUNCTION + reference access(data_handle_type p, size_t i) const noexcept { + return reference(p[i]); + } + + KOKKOS_FUNCTION + data_handle_type offset(data_handle_type p, size_t i) const noexcept { + return p + i; + } +}; + +//===================================================================== +//============= Reference Counted Accessor and DataHandle ============= +//===================================================================== + +template <class ElementType, class MemorySpace> +class ReferenceCountedDataHandle { + public: + using value_type = ElementType; + using pointer = value_type*; + using reference = value_type&; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + ReferenceCountedDataHandle() = default; + + // this only ever works on host + explicit ReferenceCountedDataHandle(SharedAllocationRecord<void, void>* rec) { + m_tracker.assign_allocated_record_to_uninitialized(rec); + m_handle = static_cast<pointer>(get_record()->data()); + } + + KOKKOS_FUNCTION + ReferenceCountedDataHandle(const SharedAllocationTracker& tracker, + pointer data_handle) + : m_tracker(tracker), m_handle(data_handle) {} + + // unmanaged ctor + template <class OtherElementType, + class = std::enable_if_t<std::is_convertible_v< + OtherElementType (*)[], value_type (*)[]>>> + KOKKOS_FUNCTION ReferenceCountedDataHandle(OtherElementType* ptr) + : m_tracker(), m_handle(ptr) {} + + // subview ctor + template <class OtherElementType, + class = std::enable_if_t<std::is_convertible_v< + OtherElementType (*)[], value_type (*)[]>>> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other, OtherElementType* ptr) + : m_tracker(other.m_tracker), m_handle(ptr) {} + + 
// converting ctor + template <class OtherElementType, + class = std::enable_if_t<std::is_convertible_v< + OtherElementType (*)[], value_type (*)[]>>> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle<OtherElementType, memory_space>& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + template < + class OtherElementType, class OtherSpace, + class = std::enable_if_t< + std::is_convertible_v<OtherElementType (*)[], value_type (*)[]> && + (std::is_same_v<OtherSpace, AnonymousSpace> || + std::is_same_v<memory_space, AnonymousSpace>)>> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle<OtherElementType, OtherSpace>& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + KOKKOS_FUNCTION + pointer get() const noexcept { return m_handle; } + KOKKOS_FUNCTION + explicit operator pointer() const noexcept { return m_handle; } + + bool has_record() const { return m_tracker.has_record(); } + auto* get_record() const { return m_tracker.get_record<memory_space>(); } + int use_count() const noexcept { return m_tracker.use_count(); } + + std::string get_label() const { return m_tracker.get_label<memory_space>(); } + KOKKOS_FUNCTION + const SharedAllocationTracker& tracker() const noexcept { return m_tracker; } + + KOKKOS_FUNCTION + friend bool operator==(const ReferenceCountedDataHandle& lhs, + const value_type* rhs) { + return lhs.m_handle == rhs; + } + + KOKKOS_FUNCTION + friend bool operator==(const value_type* lhs, + const ReferenceCountedDataHandle& rhs) { + return lhs == rhs.m_handle; + } + + private: + template <class OtherElementType, class OtherSpace> + friend class ReferenceCountedDataHandle; + + template <class OtherElementType, class OtherSpace, class NestedAccessor> + friend class ReferenceCountedAccessor; + + SharedAllocationTracker m_tracker; + pointer m_handle = nullptr; +}; + +template <class ElementType, class MemorySpace, class NestedAccessor> +class 
ReferenceCountedAccessor; + +template <class Accessor> +struct IsReferenceCountedAccessor : std::false_type {}; + +template <class ElementType, class MemorySpace, class NestedAccessor> +struct IsReferenceCountedAccessor< + ReferenceCountedAccessor<ElementType, MemorySpace, NestedAccessor>> + : std::true_type {}; + +template <class ElementType, class MemorySpace, class NestedAccessor> +class ReferenceCountedAccessor { + public: + using element_type = ElementType; + using data_handle_type = ReferenceCountedDataHandle<ElementType, MemorySpace>; + using reference = typename NestedAccessor::reference; + using offset_policy = + ReferenceCountedAccessor<ElementType, MemorySpace, + typename NestedAccessor::offset_policy>; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + constexpr ReferenceCountedAccessor() noexcept = default; + + template < + class OtherElementType, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v<OtherElementType (*)[], element_type (*)[]> && + std::is_constructible_v<NestedAccessor, OtherNestedAccessor>>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor<OtherElementType, MemorySpace, + OtherNestedAccessor>&) {} + + template < + class OtherElementType, class OtherSpace, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v<OtherElementType (*)[], element_type (*)[]> && + (std::is_same_v<OtherSpace, AnonymousSpace> || + std::is_same_v<memory_space, AnonymousSpace>)&&std:: + is_constructible_v<NestedAccessor, OtherNestedAccessor>>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor<OtherElementType, OtherSpace, + OtherNestedAccessor>&) {} + + template <class OtherElementType, + class = std::enable_if_t<std::is_convertible_v< + OtherElementType (*)[], element_type (*)[]>>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const default_accessor<OtherElementType>&) {} + + template <class DstAccessor, + 
typename = std::enable_if_t< + !IsReferenceCountedAccessor<DstAccessor>::value && + std::is_convertible_v<NestedAccessor, DstAccessor>>> + KOKKOS_FUNCTION operator DstAccessor() const { + return m_nested_acc; + } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const { + return m_nested_acc.access(p.get(), i); + } + + KOKKOS_FUNCTION + constexpr data_handle_type offset(data_handle_type p, size_t i) const { + return data_handle_type(p, m_nested_acc.offset(p.get(), i)); + } + + KOKKOS_FUNCTION + constexpr auto nested_accessor() const { return m_nested_acc; } + + private: +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor m_nested_acc; +}; + +template <class ElementType, class MemorySpace> +using CheckedReferenceCountedAccessor = + SpaceAwareAccessor<MemorySpace, + ReferenceCountedAccessor<ElementType, MemorySpace, + default_accessor<ElementType>>>; + +template <class ElementType, class MemorySpace, + class MemoryScope = desul::MemoryScopeDevice> +using CheckedRelaxedAtomicAccessor = + SpaceAwareAccessor<MemorySpace, AtomicAccessorRelaxed<ElementType>>; + +template <class ElementType, class MemorySpace, + class MemoryScope = desul::MemoryScopeDevice> +using CheckedReferenceCountedRelaxedAtomicAccessor = SpaceAwareAccessor< + MemorySpace, ReferenceCountedAccessor<ElementType, MemorySpace, + AtomicAccessorRelaxed<ElementType>>>; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp index 3846b52d2396cb58497379cbb851e71af1902685..29d1e00adfc2d1a539e46dbfeafe4bda40831ecf 100644 --- a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp +++ b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Extents.hpp @@ -37,9 +37,6 @@ struct ViewDimension; template <class T, class Dim> struct ViewDataType; -} // namespace Kokkos::Impl - 
-namespace Kokkos::Experimental::Impl { // A few things to note -- // - mdspan allows for 0-rank extents similarly to View, so we don't need @@ -106,6 +103,20 @@ struct DataTypeFromExtents { // Will cause a compile error if it is malformed (i.e. dynamic after static) using type = typename ::Kokkos::Impl::ViewDataType<T, dimension_type>::type; }; -} // namespace Kokkos::Experimental::Impl + +template <class Extents, class VM, std::size_t... Indices> +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping_impl( + const VM &view_mapping, std::index_sequence<Indices...>) { + return Extents{view_mapping.extent(Indices)...}; +} + +template <class Extents, class VM> +constexpr KOKKOS_INLINE_FUNCTION auto extents_from_view_mapping( + const VM &view_mapping) { + static_assert(Extents::rank() == VM::Rank); + return extents_from_view_mapping_impl<Extents>( + view_mapping, std::make_index_sequence<Extents::rank()>{}); +} +} // namespace Kokkos::Impl #endif // KOKKOS_EXPERIMENTAL_MDSPAN_EXTENTS_HPP diff --git a/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f990d158bfadacb4d5062df6c267284ffabd139b --- /dev/null +++ b/packages/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -0,0 +1,221 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP +#define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP + +#include "Kokkos_MDSpan_Extents.hpp" +#include <View/Kokkos_ViewDataAnalysis.hpp> + +// The difference between a legacy Kokkos array layout and an +// mdspan layout is that the array layouts can have state, but don't have the +// nested mapping. This file provides interoperability helpers. + +namespace Kokkos::Impl { + +template <class ArrayLayout> +struct LayoutFromArrayLayout; + +template <> +struct LayoutFromArrayLayout<Kokkos::LayoutLeft> { + using type = Kokkos::Experimental::layout_left_padded<dynamic_extent>; +}; + +template <> +struct LayoutFromArrayLayout<Kokkos::LayoutRight> { + using type = Kokkos::Experimental::layout_right_padded<dynamic_extent>; +}; + +template <> +struct LayoutFromArrayLayout<Kokkos::LayoutStride> { + using type = layout_stride; +}; + +template <class ArrayLayout, class MDSpanType> +KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( + const typename MDSpanType::mapping_type &mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + constexpr auto rank = extents_type::rank(); + const auto &ext = mapping.extents(); + + static_assert(rank <= ARRAY_LAYOUT_MAX_RANK, + "Unsupported rank for mdspan (must be <= 8)"); + + if constexpr (std::is_same_v<ArrayLayout, LayoutStride>) { + return Kokkos::LayoutStride{ + rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 0 ? mapping.stride(0) : 0, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? mapping.stride(1) : 0, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? mapping.stride(2) : 0, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? 
mapping.stride(3) : 0, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? mapping.stride(4) : 0, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? mapping.stride(5) : 0, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? mapping.stride(6) : 0, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? mapping.stride(7) : 0, + }; + } else { + ArrayLayout layout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 4 ? ext.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + + if constexpr (rank > 1 && + std::is_same_v<typename mapping_type::layout_type, + Kokkos::Experimental::layout_left_padded< + dynamic_extent>>) { + layout.stride = mapping.stride(1); + } + if constexpr (std::is_same_v<typename mapping_type::layout_type, + Kokkos::Experimental::layout_right_padded< + dynamic_extent>>) { + if constexpr (rank == 2) { + layout.stride = mapping.stride(0); + } + if constexpr (rank > 2) { + if (mapping.stride(rank - 2) != mapping.extents().extent(rank - 1)) + Kokkos::abort( + "Invalid conversion from layout_right_padded to LayoutRight"); + } + } + return layout; + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +template <class MappingType, class ArrayLayout, size_t... 
Idx> +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + ArrayLayout layout, std::index_sequence<Idx...>) { + using index_type = typename MappingType::index_type; + using extents_type = typename MappingType::extents_type; + if constexpr (std::is_same_v<typename MappingType::layout_type, + layout_left> || + std::is_same_v<typename MappingType::layout_type, + layout_right>) { + return MappingType{ + extents_type{dextents<index_type, MappingType::extents_type::rank()>{ + layout.dimension[Idx]...}}}; + } else { + if (layout.stride == KOKKOS_IMPL_CTOR_DEFAULT_ARG || + extents_type::rank() < 2) { + return MappingType{ + extents_type{dextents<index_type, MappingType::extents_type::rank()>{ + layout.dimension[Idx]...}}}; + } else { + if constexpr (std::is_same_v<ArrayLayout, LayoutRight> && + extents_type::rank() > 2) { + size_t product_of_dimensions = 1; + for (size_t r = 1; r < extents_type::rank(); r++) + product_of_dimensions *= layout.dimension[r]; + if (product_of_dimensions != layout.stride) + Kokkos::abort( + "Invalid conversion from LayoutRight to layout_right_padded"); + } else { + return MappingType{ + extents_type{ + dextents<index_type, MappingType::extents_type::rank()>{ + layout.dimension[Idx]...}}, + layout.stride}; + } + } + } +} +template <class MappingType, size_t... 
Idx> +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride layout, std::index_sequence<Idx...>) { + static_assert( + std::is_same_v<typename MappingType::layout_type, layout_stride>); + using index_type = typename MappingType::index_type; + index_type strides[MappingType::extents_type::rank()] = { + layout.stride[Idx]...}; + return MappingType{ + mdspan_non_standard_tag(), + static_cast<typename MappingType::extents_type>( + dextents<index_type, MappingType::extents_type::rank()>{ + layout.dimension[Idx]...}), + strides}; +} + +// specialization for rank 0 to avoid empty array +template <class MappingType> +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride, std::index_sequence<>) { + return MappingType{}; +} + +template <class MappingType, class ArrayLayout> +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout(ArrayLayout layout) { + return mapping_from_array_layout_impl<MappingType>( + layout, std::make_index_sequence<MappingType::extents_type::rank()>()); +} + +template <class MDSpanType, class VM> +KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { + using mapping_type = typename MDSpanType::mapping_type; + using extents_type = typename mapping_type::extents_type; + + // std::span is not available in C++17 (our current requirements), + // so we need to use the std::array constructor for layout mappings. 
+ // FIXME When C++20 is available, we can use std::span here instead + std::size_t strides[VM::Rank]; + view_mapping.stride_fill(&strides[0]); + if constexpr (std::is_same_v<typename mapping_type::layout_type, + Kokkos::layout_stride>) { + return mapping_type(Kokkos::mdspan_non_standard, + extents_from_view_mapping<extents_type>(view_mapping), + strides); + } else if constexpr (VM::Rank > 1 && + std::is_same_v<typename mapping_type::layout_type, + Kokkos::Experimental::layout_left_padded< + Kokkos::dynamic_extent>>) { + return mapping_type(extents_from_view_mapping<extents_type>(view_mapping), + strides[1]); + } else if constexpr (VM::Rank > 1 && + std::is_same_v<typename mapping_type::layout_type, + Kokkos::Experimental::layout_right_padded< + Kokkos::dynamic_extent>>) { + return mapping_type(extents_from_view_mapping<extents_type>(view_mapping), + strides[VM::Rank - 2]); + } else { + return mapping_type(extents_from_view_mapping<extents_type>(view_mapping)); + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif +} + +} // namespace Kokkos::Impl + +#endif // KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp index ebdf2c8211fe9209c2106e1df51b242cf760883c..79c137bfddd449a94e3d0e40c5a8b6700cdb4382 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -28,7 +28,9 @@ #include <Cuda/Kokkos_Cuda_Instance.hpp> #include <Cuda/Kokkos_Cuda_View.hpp> #include <Cuda/Kokkos_Cuda_Team.hpp> +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include <Cuda/Kokkos_Cuda_Task.hpp> +#endif #include <Cuda/Kokkos_Cuda_MDRangePolicy.hpp> #include <Cuda/Kokkos_Cuda_UniqueToken.hpp> #include <Cuda/Kokkos_Cuda_ZeroMemset.hpp> diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp index 
e115f7051f3a18434c04c5a5474267429ba0f3fb..cf405e57b8f91570545786c28130b158b5649811 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp @@ -25,9 +25,13 @@ #include <HIP/Kokkos_HIP_Half_Conversion.hpp> #include <HIP/Kokkos_HIP_Instance.hpp> #include <HIP/Kokkos_HIP_MDRangePolicy.hpp> -#include <HIP/Kokkos_HIP_Parallel_Range.hpp> -#include <HIP/Kokkos_HIP_Parallel_MDRange.hpp> -#include <HIP/Kokkos_HIP_Parallel_Team.hpp> +#include <HIP/Kokkos_HIP_ParallelFor_Range.hpp> +#include <HIP/Kokkos_HIP_ParallelFor_MDRange.hpp> +#include <HIP/Kokkos_HIP_ParallelFor_Team.hpp> +#include <HIP/Kokkos_HIP_ParallelReduce_Range.hpp> +#include <HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp> +#include <HIP/Kokkos_HIP_ParallelReduce_Team.hpp> +#include <HIP/Kokkos_HIP_ParallelScan_Range.hpp> #include <HIP/Kokkos_HIP_SharedAllocationRecord.hpp> #include <HIP/Kokkos_HIP_UniqueToken.hpp> #include <HIP/Kokkos_HIP_ZeroMemset.hpp> diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp index bd12c5c6a99f9d52cba072e361fd4d661f261774..3570ed2b6e14d6b8060cec53f1da2f011fd42d73 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -19,6 +19,9 @@ #if defined(KOKKOS_ENABLE_SYCL) #include <SYCL/Kokkos_SYCL.hpp> +#ifdef SYCL_EXT_ONEAPI_GRAPH +#include <SYCL/Kokkos_SYCL_GraphNodeKernel.hpp> +#endif #include <SYCL/Kokkos_SYCL_Half_Impl_Type.hpp> #include <SYCL/Kokkos_SYCL_Half_Conversion.hpp> #include <SYCL/Kokkos_SYCL_DeepCopy.hpp> @@ -32,6 +35,16 @@ #include <SYCL/Kokkos_SYCL_ParallelScan_Range.hpp> #include <SYCL/Kokkos_SYCL_UniqueToken.hpp> #include <SYCL/Kokkos_SYCL_ZeroMemset.hpp> + +namespace Kokkos { +namespace Experimental { +using SYCLDeviceUSMSpace = ::Kokkos::SYCLDeviceUSMSpace; +using SYCLHostUSMSpace = ::Kokkos::SYCLHostUSMSpace; +using SYCLSharedUSMSpace = 
::Kokkos::SYCLSharedUSMSpace; +using SYCL = ::Kokkos::SYCL; +} // namespace Experimental +} // namespace Kokkos + #endif #endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp index f5cbc0c1d1d67886719c2d512bdbab5ce371898b..4d7caec6f5fa43c46b31f2d632f15747013c9380 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -19,7 +19,7 @@ #if defined(KOKKOS_ENABLE_THREADS) #include <Threads/Kokkos_Threads.hpp> -#include <Threads/Kokkos_ThreadsExec.hpp> +#include <Threads/Kokkos_Threads_Instance.hpp> #include <Threads/Kokkos_Threads_MDRangePolicy.hpp> #include <Threads/Kokkos_Threads_ParallelFor_Range.hpp> #include <Threads/Kokkos_Threads_ParallelFor_MDRange.hpp> @@ -28,7 +28,7 @@ #include <Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp> #include <Threads/Kokkos_Threads_ParallelReduce_Team.hpp> #include <Threads/Kokkos_Threads_ParallelScan_Range.hpp> -#include <Threads/Kokkos_ThreadsTeam.hpp> +#include <Threads/Kokkos_Threads_Team.hpp> #include <Threads/Kokkos_Threads_UniqueToken.hpp> #endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp index 400794f86591f41946d31f1fed42e64466fe12de..399b986041e915787454c2cc4256c6d990302b20 100644 --- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp @@ -19,7 +19,6 @@ #if defined(KOKKOS_ENABLE_SYCL) namespace Kokkos { -namespace Experimental { class SYCLDeviceUSMSpace; ///< Memory space on SYCL device, not accessible from ///< the host class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL @@ -27,7 +26,6 @@ class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL class SYCLHostUSMSpace; ///< Memory space accessible from both the SYCL ///< device and the host (host pinned) class SYCL; ///< Execution space for SYCL -} // 
namespace Experimental } // namespace Kokkos #endif #endif diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index a44ffefa6b72d489d2027572f2566fe64a492057..a9db2c4cf4a3539cfd44b215947f7015377aa493 100644 --- a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -1458,7 +1458,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> { template <bool IsLeft, typename IType, typename Tagged> struct Tile_Loop_Type<1, IsLeft, IType, Tagged, - std::enable_if_t<!std::is_void<Tagged>::value>> { + std::enable_if_t<!std::is_void_v<Tagged>>> { template <typename Func, typename Offset, typename ExtentA, typename ExtentB> static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1477,7 +1477,7 @@ struct Tile_Loop_Type<1, IsLeft, IType, Tagged, template <bool IsLeft, typename IType, typename Tagged> struct Tile_Loop_Type<2, IsLeft, IType, Tagged, - std::enable_if_t<!std::is_void<Tagged>::value>> { + std::enable_if_t<!std::is_void_v<Tagged>>> { template <typename Func, typename Offset, typename ExtentA, typename ExtentB> static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1496,7 +1496,7 @@ struct Tile_Loop_Type<2, IsLeft, IType, Tagged, template <bool IsLeft, typename IType, typename Tagged> struct Tile_Loop_Type<3, IsLeft, IType, Tagged, - std::enable_if_t<!std::is_void<Tagged>::value>> { + std::enable_if_t<!std::is_void_v<Tagged>>> { template <typename Func, typename Offset, typename ExtentA, typename ExtentB> static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1515,7 +1515,7 @@ struct Tile_Loop_Type<3, IsLeft, IType, Tagged, template <bool IsLeft, typename IType, typename Tagged> struct Tile_Loop_Type<4, IsLeft, IType, Tagged, - 
std::enable_if_t<!std::is_void<Tagged>::value>> { + std::enable_if_t<!std::is_void_v<Tagged>>> { template <typename Func, typename Offset, typename ExtentA, typename ExtentB> static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1534,7 +1534,7 @@ struct Tile_Loop_Type<4, IsLeft, IType, Tagged, template <bool IsLeft, typename IType, typename Tagged> struct Tile_Loop_Type<5, IsLeft, IType, Tagged, - std::enable_if_t<!std::is_void<Tagged>::value>> { + std::enable_if_t<!std::is_void_v<Tagged>>> { template <typename Func, typename Offset, typename ExtentA, typename ExtentB> static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1553,7 +1553,7 @@ struct Tile_Loop_Type<5, IsLeft, IType, Tagged, template <bool IsLeft, typename IType, typename Tagged> struct Tile_Loop_Type<6, IsLeft, IType, Tagged, - std::enable_if_t<!std::is_void<Tagged>::value>> { + std::enable_if_t<!std::is_void_v<Tagged>>> { template <typename Func, typename Offset, typename ExtentA, typename ExtentB> static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1572,7 +1572,7 @@ struct Tile_Loop_Type<6, IsLeft, IType, Tagged, template <bool IsLeft, typename IType, typename Tagged> struct Tile_Loop_Type<7, IsLeft, IType, Tagged, - std::enable_if_t<!std::is_void<Tagged>::value>> { + std::enable_if_t<!std::is_void_v<Tagged>>> { template <typename Func, typename Offset, typename ExtentA, typename ExtentB> static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1591,7 +1591,7 @@ struct Tile_Loop_Type<7, IsLeft, IType, Tagged, template <bool IsLeft, typename IType, typename Tagged> struct Tile_Loop_Type<8, IsLeft, IType, Tagged, - std::enable_if_t<!std::is_void<Tagged>::value>> { + std::enable_if_t<!std::is_void_v<Tagged>>> { template <typename Func, typename Offset, typename ExtentA, typename 
ExtentB> static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1616,7 +1616,7 @@ struct HostIterateTile; // For ParallelFor template <typename RP, typename Functor, typename Tag, typename ValueType> struct HostIterateTile<RP, Functor, Tag, ValueType, - std::enable_if_t<std::is_void<ValueType>::value>> { + std::enable_if_t<std::is_void_v<ValueType>>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -1635,12 +1635,11 @@ struct HostIterateTile<RP, Functor, Tag, ValueType, } else { is_full_tile = false; partial_tile[i] = - (m_rp.m_upper[i] - 1 - offset[i]) == 0 - ? 1 - : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2000,30 +1999,28 @@ struct HostIterateTile<RP, Functor, Tag, ValueType, #endif template <typename... Args> - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void<Tag>::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void_v<Tag>), void> apply(Args&&... args) const { m_func(args...); } template <typename... Args> - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void<Tag>::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void_v<Tag>), void> apply(Args&&... 
args) const { m_func(m_tag, args...); } RP const m_rp; Functor const m_func; - std::conditional_t<std::is_void<Tag>::value, int, Tag> m_tag; + std::conditional_t<std::is_void_v<Tag>, int, Tag> m_tag; }; // For ParallelReduce // ValueType - scalar: For reductions template <typename RP, typename Functor, typename Tag, typename ValueType> struct HostIterateTile<RP, Functor, Tag, ValueType, - std::enable_if_t<!std::is_void<ValueType>::value && - !std::is_array<ValueType>::value>> { + std::enable_if_t<!std::is_void_v<ValueType> && + !std::is_array_v<ValueType>>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2050,12 +2047,11 @@ struct HostIterateTile<RP, Functor, Tag, ValueType, } else { is_full_tile = false; partial_tile[i] = - (m_rp.m_upper[i] - 1 - offset[i]) == 0 - ? 1 - : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2430,7 +2426,7 @@ struct HostIterateTile<RP, Functor, Tag, ValueType, RP const m_rp; Functor const m_func; - std::conditional_t<std::is_void<Tag>::value, int, Tag> m_tag; + std::conditional_t<std::is_void_v<Tag>, int, Tag> m_tag; }; // For ParallelReduce @@ -2438,8 +2434,8 @@ struct HostIterateTile<RP, Functor, Tag, ValueType, // ValueType[]: For array reductions template <typename RP, typename Functor, typename Tag, typename ValueType> struct HostIterateTile<RP, Functor, Tag, ValueType, - std::enable_if_t<!std::is_void<ValueType>::value && - std::is_array<ValueType>::value>> { + std::enable_if_t<!std::is_void_v<ValueType> && + std::is_array_v<ValueType>>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2463,12 +2459,11 @@ struct HostIterateTile<RP, Functor, Tag, ValueType, } else { is_full_tile = false; partial_tile[i] = - (m_rp.m_upper[i] - 1 - offset[i]) == 0 - ? 1 - : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2842,7 +2837,7 @@ struct HostIterateTile<RP, Functor, Tag, ValueType, RP const m_rp; Functor const m_func; - std::conditional_t<std::is_void<Tag>::value, int, Tag> m_tag; + std::conditional_t<std::is_void_v<Tag>, int, Tag> m_tag; }; // ------------------------------------------------------------------ // diff --git a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp index e1273ab9e3bd65804072ada103b556a29228053b..e6b2fcbef4bc499f716398565b68c4a43049186c 100644 --- a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp +++ b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp @@ -41,13 +41,13 @@ struct EmulateCUDADim3 { template <class Tag, class Functor, class... Args> KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t<std::is_void<Tag>::value> _tag_invoke(Functor const& f, Args&&... args) { - f((Args &&) args...); + f((Args&&)args...); } template <class Tag, class Functor, class... Args> KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t<!std::is_void<Tag>::value> _tag_invoke(Functor const& f, Args&&... args) { - f(Tag{}, (Args &&) args...); + f(Tag{}, (Args&&)args...); } template <class Tag, class Functor, class T, size_t N, size_t... Idxs, @@ -55,7 +55,7 @@ template <class Tag, class Functor, class T, size_t N, size_t... Idxs, KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array_helper( Functor const& f, T (&vals)[N], std::integer_sequence<size_t, Idxs...>, Args&&... args) { - _tag_invoke<Tag>(f, vals[Idxs]..., (Args &&) args...); + _tag_invoke<Tag>(f, vals[Idxs]..., (Args&&)args...); } template <class Tag, class Functor, class T, size_t N, class... Args> @@ -63,7 +63,7 @@ KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array(Functor const& f, T (&vals)[N], Args&&... 
args) { _tag_invoke_array_helper<Tag>(f, vals, std::make_index_sequence<N>{}, - (Args &&) args...); + (Args&&)args...); } // ------------------------------------------------------------------ // diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp index d77ec0c7537fd465c9029c89705e64d913f35168..b483653021a58bc6b18771b0e78fc4049f17fe9d 100644 --- a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -143,7 +143,7 @@ struct AnalyzeExecPolicyUseMatcher<void, type_list<>, Trait, Traits...> { static constexpr auto trigger_error_message = show_name_of_invalid_execution_policy_trait<Trait>{}; static_assert( - /* always false: */ std::is_void<Trait>::value, + /* always false: */ std::is_void_v<Trait>, "Unknown execution policy trait. Search compiler output for " "'show_name_of_invalid_execution_policy_trait' to see the type of the " "invalid trait."); diff --git a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp index d8ab77b205639889980a0b1f3994ed292b1e1aee..4ea0b8d343b58ddf027725580a81cc0ba06d9e1b 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ChaseLev.hpp @@ -95,12 +95,12 @@ struct non_owning_variable_size_circular_buffer { non_owning_variable_size_circular_buffer( non_owning_variable_size_circular_buffer const&) = delete; non_owning_variable_size_circular_buffer( - non_owning_variable_size_circular_buffer&&) = default; - non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer&&) = default; + non_owning_variable_size_circular_buffer& operator=( non_owning_variable_size_circular_buffer const&) = delete; - non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer& operator=( non_owning_variable_size_circular_buffer&&) = default; - 
~non_owning_variable_size_circular_buffer() = default; + ~non_owning_variable_size_circular_buffer() = default; KOKKOS_FORCEINLINE_FUNCTION constexpr size_type size() const noexcept { return m_size; } @@ -138,7 +138,7 @@ struct ChaseLevDeque { public: template <class _ignore = void, class = std::enable_if_t< - std::is_default_constructible<CircularBufferT>::value>> + std::is_default_constructible_v<CircularBufferT>>> ChaseLevDeque() : m_array() {} explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {} @@ -165,7 +165,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; @@ -226,7 +226,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp index 6e3d99ebd685308cc29543fde5475ec148b717c6..ee53fd8bc6d4ff50f204c73f5b5404320b3ae80a 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -27,8 +27,9 @@ // To use OpenCL(TM) built-in intrinsics inside kernels, we have to // forward-declare their prototype, also see // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md -#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +#if defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) extern SYCL_EXTERNAL unsigned long __attribute__((overloadable)) 
intel_get_cycle_counter(); #endif @@ -55,8 +56,10 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline uint64_t clock_tic_device() noexcept { // Return value of 64-bit hi-res clock register. return clock64(); -#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +// FIXME_SYCL We can only return something useful for Intel GPUs and with RDC +#elif defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) return intel_get_cycle_counter(); diff --git a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp index e6dd3c63391d13b75765a39d02908bbfd1fed819..d7319e80c8715b4dd75382554cfb85cdc9e049c5 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp @@ -93,7 +93,7 @@ struct CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>, std::move(arg_values))... {} template <size_t Idx, class ValueType> - KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { + KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { return this->CombinedReducerValueItemImpl<Idx, ValueType>::ref(); } template <size_t Idx, class ValueType> @@ -181,7 +181,7 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space, KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( - CombinedReducerImpl&&) = default; + CombinedReducerImpl&&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( @@ -192,8 +192,8 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space, template <class... 
ReducersDeduced> KOKKOS_FUNCTION constexpr explicit CombinedReducerImpl( value_type& value, ReducersDeduced&&... reducers) noexcept - : CombinedReducerStorageImpl<Idxs, Reducers>((ReducersDeduced &&) - reducers)..., + : CombinedReducerStorageImpl<Idxs, Reducers>( + (ReducersDeduced&&)reducers)..., m_value_view(&value) {} KOKKOS_FUNCTION constexpr void join(value_type& dest, @@ -348,8 +348,8 @@ struct CombinedReductionFunctorWrapperImpl< IndexOrMemberOrTagType1&& arg_first, IndexOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl<IndexOrMemberOrTagType1&&>( - (IndexOrMemberOrTagType1 &&) arg_first, - (IndexOrMemberTypesThenValueType &&) args...); + (IndexOrMemberOrTagType1&&)arg_first, + (IndexOrMemberTypesThenValueType&&)args...); } // </editor-fold> end call operator }}}2 @@ -369,19 +369,19 @@ struct CombinedReductionFunctorWrapperImpl< template <class... IdxOrMemberTypes, class IdxOrMemberType1, class... IdxOrMemberTypesThenValueType> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_same<remove_cvref_t<IdxOrMemberType1>, value_type>::value> + !std::is_same_v<remove_cvref_t<IdxOrMemberType1>, value_type>> _call_op_impl(IdxOrMemberTypes&&... idxs, IdxOrMemberType1&& idx, IdxOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl<IdxOrMemberTypes&&..., IdxOrMemberType1&&>( - (IdxOrMemberTypes &&) idxs..., (IdxOrMemberType1 &&) idx, - (IdxOrMemberTypesThenValueType &&) args...); + (IdxOrMemberTypes&&)idxs..., (IdxOrMemberType1&&)idx, + (IdxOrMemberTypesThenValueType&&)args...); } // base case template <class... IdxOrMemberTypes> KOKKOS_FORCEINLINE_FUNCTION void _call_op_impl(IdxOrMemberTypes&&... 
idxs, value_type& out) const { - m_functor((IdxOrMemberTypes &&) idxs..., + m_functor((IdxOrMemberTypes&&)idxs..., out.template get<Idxs, typename Reducers::value_type>()...); } }; @@ -464,8 +464,8 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer_value( typename _reducer_from_arg_t<Space, ReferencesOrViewsOrReducers>::value_type...>{ // This helper function is now poorly named after refactoring. - _get_value_from_combined_reducer_ctor_arg((ReferencesOrViewsOrReducers &&) - args)...}; + _get_value_from_combined_reducer_ctor_arg( + (ReferencesOrViewsOrReducers&&)args)...}; //---------------------------------------- } @@ -480,7 +480,7 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer( Space, _reducer_from_arg_t<Space, ReferencesOrViewsOrReducers>...>; return reducer_type(value, _reducer_from_arg_t<Space, ReferencesOrViewsOrReducers>{ - (ReferencesOrViewsOrReducers &&) args}...); + (ReferencesOrViewsOrReducers&&)args}...); //---------------------------------------- } diff --git a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp index ca4edce5c3884ac89e24c578767d4a15f231ccf1..9bde2f72a3ff7b23d2ca2fd9d7b6952f3a6369d2 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp @@ -110,15 +110,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). 
- const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast<volatile int *>(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast<uint32_t *>(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast<uint32_t *>(buffer), 1); return state_error ? type(-2, -2) : type(-1, -1); } @@ -132,7 +132,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast<uint32_t *>(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -194,15 +195,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast<volatile int *>(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast<uint32_t *>(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast<uint32_t *>(buffer), 1); return state_error ? 
type(-2, -2) : type(-1, -1); } @@ -216,7 +217,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast<uint32_t *>(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -262,8 +264,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_and(buffer + (bit >> bits_per_int_lg2) + 1, ~mask); + const uint32_t prev = Kokkos::atomic_fetch_and( + const_cast<uint32_t *>(buffer) + (bit >> bits_per_int_lg2) + 1, ~mask); if (!(prev & mask)) { return -1; @@ -273,7 +275,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast<uint32_t *>(buffer), 1); // Flush the store-release Kokkos::memory_fence(); @@ -299,8 +301,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_or(buffer + (bit >> bits_per_int_lg2) + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast<uint32_t *>(buffer) + (bit >> bits_per_int_lg2) + 1, mask); if (!(prev & mask)) { return -1; @@ -310,7 +312,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast<uint32_t *>(buffer), 1); return (count & state_used_mask) - 1; } diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp index 5c182db5663a7a4197404d6ea2d8f37defc25476..72f33ffaab909be732a2db3a57c907b828491e15 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp @@ 
-90,9 +90,8 @@ void combine(Kokkos::InitializationSettings& out, KOKKOS_IMPL_COMBINE_SETTING(num_threads); KOKKOS_IMPL_COMBINE_SETTING(map_device_id_by); KOKKOS_IMPL_COMBINE_SETTING(device_id); - KOKKOS_IMPL_COMBINE_SETTING(num_devices); - KOKKOS_IMPL_COMBINE_SETTING(skip_device); KOKKOS_IMPL_COMBINE_SETTING(disable_warnings); + KOKKOS_IMPL_COMBINE_SETTING(print_configuration); KOKKOS_IMPL_COMBINE_SETTING(tune_internals); KOKKOS_IMPL_COMBINE_SETTING(tools_help); KOKKOS_IMPL_COMBINE_SETTING(tools_libs); @@ -131,11 +130,15 @@ void combine(Kokkos::Tools::InitArguments& out, int get_device_count() { #if defined(KOKKOS_ENABLE_CUDA) - return Kokkos::Cuda::detect_device_count(); + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_HIP) - return Kokkos::HIP::detect_device_count(); + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_SYCL) - return sycl::device::get_devices(sycl::info::device_type::gpu).size(); + return Kokkos::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -165,20 +168,43 @@ bool is_valid_map_device_id_by(std::string const& x) { } // namespace +std::vector<int> const& Kokkos::Impl::get_visible_devices() { + static auto devices = get_visible_devices(get_device_count()); + return devices; +} + [[nodiscard]] int Kokkos::device_id() noexcept { #if defined(KOKKOS_ENABLE_CUDA) - return Cuda().cuda_device(); + int device = Cuda().cuda_device(); #elif defined(KOKKOS_ENABLE_HIP) - return HIP().hip_device(); + int device = HIP().hip_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - return Experimental::OpenACC().acc_device_number(); + int device = Experimental::OpenACC().acc_device_number(); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_default_device(); // FIXME_OPENMPTARGET + int device = omp_get_default_device(); // 
FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - return Experimental::Impl::SYCLInternal::m_syclDev; + int device = Impl::SYCLInternal::m_syclDev; #else - return -1; + int device = -1; + return device; #endif + auto const& visible_devices = Impl::get_visible_devices(); + for (std::size_t i = 0; i < visible_devices.size(); ++i) { + if (visible_devices[i] == device) { + return i; + } + } + Kokkos::abort("Unexpected error: cannot determine device id"); + return -1; +} + +[[nodiscard]] int Kokkos::num_devices() noexcept { + if constexpr (std::is_same_v<DefaultExecutionSpace, + DefaultHostExecutionSpace>) { + return -1; // no GPU backend enabled + } else { + return Impl::get_visible_devices().size(); + } } [[nodiscard]] int Kokkos::num_threads() noexcept { @@ -245,7 +271,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: local rank " << local_rank << " is outside the bounds of resource groups provided by CTest. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the resource types allocated to this resource group @@ -258,7 +284,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_name << " is not specified. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Look for the device type specified in CTEST_KOKKOS_DEVICE_TYPE @@ -282,7 +308,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: device type '" << ctest_kokkos_device_type << "' not included in " << ctest_resource_group_name << ". Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the device ID @@ -298,7 +324,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_id_name << " is not specified. 
Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } auto const* comma = std::strchr(resource_str, ','); @@ -306,15 +332,14 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: invalid value of " << ctest_resource_group_id_name << ": '" << resource_str << "'. Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } std::string id(resource_str + 3, comma - resource_str - 3); return std::stoi(id.c_str()); } -std::vector<int> Kokkos::Impl::get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count) { +std::vector<int> Kokkos::Impl::get_visible_devices(int device_count) { std::vector<int> visible_devices; char* env_visible_devices = std::getenv("KOKKOS_VISIBLE_DEVICES"); if (env_visible_devices) { @@ -341,30 +366,9 @@ std::vector<int> Kokkos::Impl::get_visible_devices( } } } else { - int num_devices = - settings.has_num_devices() ? settings.get_num_devices() : device_count; - if (num_devices > device_count) { - std::stringstream ss; - ss << "Error: Specified number of devices '" << num_devices - << "' exceeds the actual number of GPUs available for execution '" - << device_count << "'." 
- << " Raised by Kokkos::initialize().\n"; - Kokkos::abort(ss.str().c_str()); - } - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < device_count; ++i) { visible_devices.push_back(i); } - if (settings.has_skip_device()) { - if (visible_devices.size() == 1 && settings.get_skip_device() == 0) { - Kokkos::abort( - "Error: skipping the only GPU available for execution.\n" - " Raised by Kokkos::initialize().\n"); - } - visible_devices.erase( - std::remove(visible_devices.begin(), visible_devices.end(), - settings.get_skip_device()), - visible_devices.end()); - } } if (visible_devices.empty()) { Kokkos::abort( @@ -374,10 +378,10 @@ std::vector<int> Kokkos::Impl::get_visible_devices( return visible_devices; } -int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { - std::vector<int> visible_devices = - get_visible_devices(settings, get_device_count()); - int const num_devices = visible_devices.size(); +std::optional<int> Kokkos::Impl::get_gpu( + const InitializationSettings& settings) { + std::vector<int> visible_devices = get_visible_devices(get_device_count()); + int const num_devices = visible_devices.size(); // device_id is provided if (settings.has_device_id()) { int const id = settings.get_device_id(); @@ -423,14 +427,15 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { int const mpi_local_rank = mpi_local_rank_on_node(); - // use first GPU available for execution if unable to detect local MPI rank + // if unable to detect local MPI rank return nullopt to delegate device + // selection to the backend if (mpi_local_rank < 0) { if (settings.has_map_device_id_by()) { std::cerr << "Warning: unable to detect local MPI rank." << " Falling back to the first GPU available for execution." << " Raised by Kokkos::initialize()." 
<< std::endl; } - return visible_devices[0]; + return std::nullopt; } // use device assigned by CTest when resource allocation is activated @@ -445,13 +450,6 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { namespace { void initialize_backends(const Kokkos::InitializationSettings& settings) { -// This is an experimental setting -// For KNL in Flat mode this variable should be set, so that -// memkind allocates high bandwidth memory correctly. -#ifdef KOKKOS_ENABLE_HBWSPACE - setenv("MEMKIND_HBW_NODES", "1", 0); -#endif - Kokkos::Impl::ExecSpaceManager::get_instance().initialize_spaces(settings); } @@ -571,19 +569,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "no"); #endif -#ifdef KOKKOS_ENABLE_HBWSPACE - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "no"); -#endif -#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "no"); -#endif - #ifdef KOKKOS_ENABLE_ASM declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes"); #else @@ -604,6 +589,11 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX23", "no"); #endif +#ifdef KOKKOS_ENABLE_CXX26 + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "no"); +#endif #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK", "yes"); @@ -616,18 +606,14 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "no"); #endif -#ifdef KOKKOS_ENABLE_LIBRT - 
declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes"); -#else - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no"); -#endif #ifdef KOKKOS_ENABLE_LIBDL declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "yes"); #else declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "no"); #endif + declare_configuration_metadata("architecture", "Default Device", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); #if defined(KOKKOS_ARCH_A64FX) declare_configuration_metadata("architecture", "CPU architecture", "A64FX"); @@ -645,8 +631,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "ARMV8_THUNDERX2"); #elif defined(KOKKOS_ARCH_BDW) declare_configuration_metadata("architecture", "CPU architecture", "BDW"); -#elif defined(KOKKOS_ARCH_BGQ) - declare_configuration_metadata("architecture", "CPU architecture", "BGQ"); #elif defined(KOKKOS_ARCH_HSW) declare_configuration_metadata("architecture", "CPU architecture", "HSW"); #elif defined(KOKKOS_ARCH_ICL) @@ -659,8 +643,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "KNL"); #elif defined(KOKKOS_ARCH_NATIVE) declare_configuration_metadata("architecture", "CPU architecture", "NATIVE"); -#elif defined(KOKKOS_ARCH_POWER7) - declare_configuration_metadata("architecture", "CPU architecture", "POWER7"); #elif defined(KOKKOS_ARCH_POWER8) declare_configuration_metadata("architecture", "CPU architecture", "POWER8"); #elif defined(KOKKOS_ARCH_POWER9) @@ -673,8 +655,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "SNB"); #elif defined(KOKKOS_ARCH_SPR) declare_configuration_metadata("architecture", "CPU architecture", "SPR"); -#elif defined(KOKKOS_ARCH_WSM) - declare_configuration_metadata("architecture", "CPU 
architecture", "WSM"); #elif defined(KOKKOS_ARCH_AMD_ZEN) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN"); #elif defined(KOKKOS_ARCH_AMD_ZEN2) @@ -683,6 +663,12 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_ZEN3) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN3"); +#elif defined(KOKKOS_ARCH_RISCV_SG2042) + declare_configuration_metadata("architecture", "CPU architecture", + "SG2042 (RISC-V)"); +#elif defined(KOKKOS_ARCH_RISCV_RVA22V) + declare_configuration_metadata("architecture", "CPU architecture", + "RVA22V (RISC-V)"); #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -752,11 +738,11 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_ADA89) declare_configuration_metadata("architecture", "GPU architecture", "ADA89"); #elif defined(KOKKOS_ARCH_HOPPER90) - declare_configuration_metadata("architecture", "GPU architecture", - "HOPPER90"); + declare_configuration_metadata("architecture", "GPU architecture", + "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX906"); + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX906"); #elif defined(KOKKOS_ARCH_AMD_GFX908) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX908"); @@ -769,6 +755,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_GFX1100) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX1100"); +#elif defined(KOKKOS_ARCH_AMD_GFX1103) + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX1103"); #else declare_configuration_metadata("architecture", "GPU architecture", "none"); @@ -804,34 +793,18 @@ void initialize_internal(const Kokkos::InitializationSettings&
settings) { post_initialize_internal(settings); } -void pre_finalize_internal() { - typename decltype(finalize_hooks)::size_type numSuccessfulCalls = 0; +// declared noexcept such that std::terminate is called if any of the registered +// function throws +void call_registered_finalize_hook_functions() noexcept { while (!finalize_hooks.empty()) { - auto f = finalize_hooks.top(); - try { - f(); - } catch (...) { - std::cerr << "Kokkos::finalize: A finalize hook (set via " - "Kokkos::push_finalize_hook) threw an exception that it did " - "not catch." - " Per std::atexit rules, this results in std::terminate. " - "This is " - "finalize hook number " - << numSuccessfulCalls - << " (1-based indexing) " - "out of " - << finalize_hooks.size() - << " to call. Remember that " - "Kokkos::finalize calls finalize hooks in reverse order " - "from how they " - "were pushed." - << std::endl; - std::terminate(); - } + auto const& func = finalize_hooks.top(); + func(); finalize_hooks.pop(); - ++numSuccessfulCalls; } +} +void pre_finalize_internal() { + call_registered_finalize_hook_functions(); Kokkos::Profiling::finalize(); } @@ -911,36 +884,18 @@ void Kokkos::Impl::parse_command_line_arguments( int num_threads; int device_id; - int num_devices; // deprecated - int skip_device; // deprecated std::string map_device_id_by; bool disable_warnings; bool print_configuration; bool tune_internals; - auto get_flag = [](std::string s) -> std::string { - return s.erase(s.find('=')); - }; - bool help_flag = false; int iarg = 0; while (iarg < argc) { bool remove_flag = false; - if (check_arg(argv[iarg], "--kokkos-numa") || - check_arg(argv[iarg], "--numa")) { - warn_deprecated_command_line_argument(get_flag(argv[iarg])); - // remove flag if prefixed with '--kokkos-' - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads) || - check_arg_int(argv[iarg], "--num-threads", num_threads) || - check_arg_int(argv[iarg], 
"--kokkos-threads", num_threads) || - check_arg_int(argv[iarg], "--threads", num_threads)) { - if (get_flag(argv[iarg]) != "--kokkos-num-threads") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-num-threads"); - } + if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads)) { if (!is_valid_num_threads(num_threads)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." @@ -949,15 +904,8 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_num_threads(num_threads); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id) || - check_arg_int(argv[iarg], "--device-id", device_id) || - check_arg_int(argv[iarg], "--kokkos-device", device_id) || - check_arg_int(argv[iarg], "--device", device_id)) { - if (get_flag(argv[iarg]) != "--kokkos-device-id") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-device-id"); - } + remove_flag = true; + } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id)) { if (!is_valid_device_id(device_id)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." 
@@ -966,70 +914,7 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_device_id(device_id); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices") || - check_arg(argv[iarg], "--ndevices")) { - if (check_arg(argv[iarg], "--num-devices")) { - warn_deprecated_command_line_argument("--num-devices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--ndevices")) { - warn_deprecated_command_line_argument("--ndevices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--kokkos-ndevices")) { - warn_deprecated_command_line_argument("--kokkos-ndevices", - "--kokkos-num-devices"); - } - warn_deprecated_command_line_argument( - "--kokkos-num-devices", "--kokkos-map-device-id-by=mpi_rank"); - // Find the number of device (expecting --device=XX) - if (!((strncmp(argv[iarg], "--kokkos-num-devices=", 21) == 0) || - (strncmp(argv[iarg], "--num-devices=", 14) == 0) || - (strncmp(argv[iarg], "--kokkos-ndevices=", 18) == 0) || - (strncmp(argv[iarg], "--ndevices=", 11) == 0))) - throw_runtime_exception( - "Error: expecting an '=INT[,INT]' after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - - char* num1 = strchr(argv[iarg], '=') + 1; - char* num2 = strpbrk(num1, ","); - int num1_len = num2 == nullptr ? strlen(num1) : num2 - num1; - char* num1_only = new char[num1_len + 1]; - strncpy(num1_only, num1, num1_len); - num1_only[num1_len] = '\0'; - - if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) { - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices'." 
- " Raised by Kokkos::initialize()."); - } - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - num_devices = std::stoi(num1_only); - settings.set_num_devices(num_devices); - settings.set_map_device_id_by("mpi_rank"); - } - delete[] num1_only; - - if (num2 != nullptr) { - if ((!is_unsigned_int(num2 + 1)) || (strlen(num2) == 1)) - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices=XX,'." - " Raised by Kokkos::initialize()."); - - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - skip_device = std::stoi(num2 + 1); - settings.set_skip_device(skip_device); - } - } - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; + remove_flag = true; } else if (check_arg_bool(argv[iarg], "--kokkos-disable-warnings", disable_warnings)) { settings.set_disable_warnings(disable_warnings); @@ -1094,13 +979,10 @@ void Kokkos::Impl::parse_environment_variables( Tools::Impl::parse_environment_variables(tools_init_arguments); if (init_result.result == Tools::Impl::InitializationStatus::environment_argument_mismatch) { - Impl::throw_runtime_exception(init_result.error_message); + Kokkos::abort(init_result.error_message.c_str()); } combine(settings, tools_init_arguments); - if (std::getenv("KOKKOS_NUMA")) { - warn_deprecated_environment_variable("KOKKOS_NUMA"); - } int num_threads; if (check_env_int("KOKKOS_NUM_THREADS", num_threads)) { if (!is_valid_num_threads(num_threads)) { @@ -1125,34 +1007,6 @@ void Kokkos::Impl::parse_environment_variables( } settings.set_device_id(device_id); } - int num_devices; - int rand_devices; - bool has_num_devices = check_env_int("KOKKOS_NUM_DEVICES", num_devices); - bool has_rand_devices = check_env_int("KOKKOS_RAND_DEVICES", rand_devices); - if (has_rand_devices && has_num_devices) { - Impl::throw_runtime_exception( - "Error: cannot specify both KOKKOS_NUM_DEVICES and " - 
"KOKKOS_RAND_DEVICES." - " Raised by Kokkos::initialize()."); - } - if (has_num_devices) { - warn_deprecated_environment_variable("KOKKOS_NUM_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=mpi_rank"); - settings.set_map_device_id_by("mpi_rank"); - settings.set_num_devices(num_devices); - } - if (has_rand_devices) { - warn_deprecated_environment_variable("KOKKOS_RAND_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=random"); - settings.set_map_device_id_by("random"); - settings.set_num_devices(rand_devices); - } - if (has_num_devices || has_rand_devices) { - int skip_device; - if (check_env_int("KOKKOS_SKIP_DEVICE", skip_device)) { - settings.set_skip_device(skip_device); - } - } bool disable_warnings; if (check_env_bool("KOKKOS_DISABLE_WARNINGS", disable_warnings)) { settings.set_disable_warnings(disable_warnings); diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp index c71c21d2ac988d923710ada399ddbd8c0384570a..cd00fdadebaffa69ce925e55d25d3463768bc68c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp @@ -36,15 +36,22 @@ struct GraphNodeKernelDefaultImpl { // TODO @graphs decide if this should use vtable or intrusive erasure via // function pointers like in the rest of the graph interface virtual void execute_kernel() = 0; + + GraphNodeKernelDefaultImpl() = default; + + explicit GraphNodeKernelDefaultImpl(ExecutionSpace exec) + : m_execution_space(std::move(exec)) {} + + ExecutionSpace m_execution_space; }; // TODO Indicate that this kernel specialization is only for the Host somehow? template <class ExecutionSpace, class PolicyType, class Functor, class PatternTag, class... 
Args> class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType, - Args..., ExecutionSpace>::type, - public GraphNodeKernelDefaultImpl<ExecutionSpace> { + : public GraphNodeKernelDefaultImpl<ExecutionSpace>, + public PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType, + Args..., ExecutionSpace>::type { public: using base_t = typename PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType, @@ -58,22 +65,24 @@ class GraphNodeKernelImpl // TODO @graph kernel name info propagation template <class PolicyDeduced, class... ArgsDeduced> - GraphNodeKernelImpl(std::string const&, ExecutionSpace const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...), - execute_kernel_vtable_base_t() {} + GraphNodeKernelImpl(std::string const &, ExecutionSpace const &, + Functor arg_functor, PolicyDeduced &&arg_policy, + ArgsDeduced &&...args) + : execute_kernel_vtable_base_t(arg_policy.space()), + base_t(std::move(arg_functor), (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) {} // FIXME @graph Forward through the instance once that works in the backends template <class PolicyDeduced, class... ArgsDeduced> - GraphNodeKernelImpl(ExecutionSpace const& ex, Functor arg_functor, - PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + GraphNodeKernelImpl(ExecutionSpace const &ex, Functor arg_functor, + PolicyDeduced &&arg_policy, ArgsDeduced &&...args) : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) { + // FIXME This constructor seems unused.
+ } - void execute_kernel() final { this->base_t::execute(); } + void execute_kernel() override final { this->base_t::execute(); } }; // </editor-fold> end GraphNodeKernelImpl }}}1 @@ -88,7 +97,7 @@ struct GraphNodeAggregateKernelDefaultImpl using is_graph_kernel = std::true_type; }; using graph_kernel = GraphNodeAggregateKernelDefaultImpl; - void execute_kernel() final {} + void execute_kernel() override final {} }; } // end namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp index 223ae391ab40f1b4347a4a7613d41fddd9ad69f0..31d147ea894bda05367deb20f1c5f76990efa923 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp @@ -69,10 +69,10 @@ struct GraphNodeBackendSpecificDetails { GraphNodeBackendSpecificDetails(GraphNodeBackendSpecificDetails&&) noexcept = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails const&) = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails&&) noexcept = delete; ~GraphNodeBackendSpecificDetails() = default; @@ -92,6 +92,18 @@ struct GraphNodeBackendSpecificDetails { m_is_aggregate = true; } + // A node is awaitable if it can execute a kernel. + // A root node or an aggregate node cannot be waited for, because it does + // not launch anything. + bool awaitable() const { return (!m_is_root) && (!m_is_aggregate); } + + // Retrieve the execution space instance that has been passed to + // the kernel at construction phase. 
+ const ExecutionSpace& get_execution_space() const { + KOKKOS_EXPECTS(m_kernel_ptr != nullptr) + return m_kernel_ptr->m_execution_space; + } + void set_predecessor( std::shared_ptr<GraphNodeBackendSpecificDetails<ExecutionSpace>> arg_pred_impl) { @@ -104,7 +116,7 @@ struct GraphNodeBackendSpecificDetails { m_predecessors.push_back(std::move(arg_pred_impl)); } - void execute_node() { + void execute_node(const ExecutionSpace& exec) { // This node could have already been executed as the predecessor of some // other KOKKOS_EXPECTS(bool(m_kernel_ptr) || m_has_executed) @@ -115,8 +127,18 @@ struct GraphNodeBackendSpecificDetails { // supported semantics, but instinct I have feels like it should be... m_has_executed = true; for (auto const& predecessor : m_predecessors) { - predecessor->execute_node(); + predecessor->execute_node(exec); } + + // Before executing the kernel, be sure to fence the execution space + // instance of predecessors. + for (const auto& predecessor : m_predecessors) { + if (predecessor->awaitable() && + predecessor->get_execution_space() != this->get_execution_space()) + predecessor->get_execution_space().fence( + "Kokkos::DefaultGraphNode::execute_node: sync with predecessors"); + } + m_kernel_ptr->execute_kernel(); } KOKKOS_ENSURES(m_has_executed) diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 3693dff3d465e66730cf75f488b65a6b19c84020..8dfa19a178cff477c5697e3e8f9b906d7cd947ae 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -56,14 +56,14 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage<ExecutionSpace> { //---------------------------------------------------------------------------- // <editor-fold desc="Constructors, destructor, and assignment"> {{{2 - // Not moveable or copyable; it spends its whole live as a shared_ptr in the + // Not movable or 
copyable; it spends its whole life as a shared_ptr in the // Graph object - GraphImpl() = default; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = default; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; - ~GraphImpl() = default; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() = default; explicit GraphImpl(ExecutionSpace arg_space) : execution_space_instance_storage_base_t(std::move(arg_space)) {} @@ -82,10 +82,7 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage<ExecutionSpace> { template <class NodeImpl> // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl void add_node(std::shared_ptr<NodeImpl> const& arg_node_ptr) { - static_assert( - NodeImpl::kernel_type::Policy::is_graph_kernel::value, - "Something has gone horribly wrong, but it's too complicated to " - "explain here. Buy Daisy a coffee and she'll explain it to you."); + static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); // Since this is always called before any calls to add_predecessor involving // it, we can treat this node as a sink until we discover otherwise. arg_node_ptr->node_details_t::set_kernel(arg_node_ptr->get_kernel()); @@ -139,17 +136,40 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage<ExecutionSpace> { return rv; } - void submit() { + void instantiate() { + KOKKOS_EXPECTS(!m_has_been_instantiated); + m_has_been_instantiated = true; + } + + void submit(const ExecutionSpace& exec) { + if (!m_has_been_instantiated) instantiate(); // This reset is gross, but for the purposes of our simple host // implementation... for (auto& sink : m_sinks) { sink->reset_has_executed(); } + + // We don't know where the nodes will execute, so we need to fence the given + // execution space instance before proceeding.
This is the simplest way + // of guaranteeing that the kernels in the graph are correctly "enqueued". + exec.fence( + "Kokkos::DefaultGraph::submit: fencing before launching graph nodes"); + for (auto& sink : m_sinks) { - sink->execute_node(); + sink->execute_node(exec); + } + + // Once all sinks have been executed, we need to fence them. + for (const auto& sink : m_sinks) { + if (sink->awaitable() && sink->get_execution_space() != exec) + sink->get_execution_space().fence( + "Kokkos::DefaultGraph::submit: fencing before ending graph submit"); } } + private: + bool m_has_been_instantiated = false; + // </editor-fold> end required customizations }}}2 //---------------------------------------------------------------------------- }; diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp b/packages/kokkos/core/src/impl/Kokkos_DesulAtomicsConfig.hpp similarity index 72% rename from packages/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp rename to packages/kokkos/core/src/impl/Kokkos_DesulAtomicsConfig.hpp index 4cf170f5f1317e380a91196a9aaa36ec519c45df..02ab127d5c5d0613a4fee44c6259b8939c24d584 100644 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Config.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_DesulAtomicsConfig.hpp @@ -13,15 +13,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include <Kokkos_Macros.hpp> -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_ATOMICS_DESUL_CONFIG_HPP -#define KOKKOS_ATOMICS_DESUL_CONFIG_HPP -#include <Kokkos_Macros.hpp> +#ifndef KOKKOS_DESUL_ATOMICS_CONFIG_HPP +#define KOKKOS_DESUL_ATOMICS_CONFIG_HPP #if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) #define DESUL_CUDA_ARCH_IS_PRE_PASCAL @@ -32,4 +26,4 @@ static_assert(false, #define DESUL_CUDA_ARCH_IS_PRE_VOLTA #endif -#endif // KOKKOS_ATOMICS_DESUL_CONFIG_HPP +#endif diff --git 
a/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp b/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp index bd89c8b19ca9d323d12d9139c885dfefa92e0bc1..70dca5d8fadebd813d02876c94cb5d83d47e3008 100644 --- a/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp @@ -17,17 +17,17 @@ #ifndef KOKKOS_DEVICE_MANAGEMENT_HPP #define KOKKOS_DEVICE_MANAGEMENT_HPP +#include <optional> #include <vector> namespace Kokkos { class InitializationSettings; namespace Impl { -int get_gpu(const Kokkos::InitializationSettings& settings); +std::optional<int> get_gpu(const Kokkos::InitializationSettings& settings); // This declaration is provided for testing purposes only int get_ctest_gpu(int local_rank); -// ditto -std::vector<int> get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count); +std::vector<int> get_visible_devices(int device_count); // test-only +std::vector<int> const& get_visible_devices(); // use this instead } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp index 8ba94ba4ccc46bd2f8fd93cce0347cdf8ab4b067..a8a4d6617bcdd796b4fd866a90bcfaf25f78af3c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp @@ -52,16 +52,16 @@ struct EBOBaseImpl; template <class T, template <class...> class CtorNotOnDevice> struct EBOBaseImpl<T, true, CtorNotOnDevice> { template <class... Args, class _ignored = void, - std::enable_if_t<std::is_void<_ignored>::value && - std::is_constructible<T, Args...>::value && + std::enable_if_t<std::is_void_v<_ignored> && + std::is_constructible_v<T, Args...> && !CtorNotOnDevice<Args...>::value, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&...) noexcept {} template <class... 
Args, class _ignored = void, - std::enable_if_t<std::is_void<_ignored>::value && - std::is_constructible<T, Args...>::value && + std::enable_if_t<std::is_void_v<_ignored> && + std::is_constructible_v<T, Args...> && CtorNotOnDevice<Args...>::value, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {} @@ -110,18 +110,18 @@ struct EBOBaseImpl<T, false, CTorsNotOnDevice> { T m_ebo_object; template <class... Args, class _ignored = void, - std::enable_if_t<std::is_void<_ignored>::value && + std::enable_if_t<std::is_void_v<_ignored> && !CTorsNotOnDevice<Args...>::value && - std::is_constructible<T, Args...>::value, + std::is_constructible_v<T, Args...>, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&... args) noexcept(noexcept(T(std::forward<Args>(args)...))) : m_ebo_object(std::forward<Args>(args)...) {} template <class... Args, class _ignored = void, - std::enable_if_t<std::is_void<_ignored>::value && + std::enable_if_t<std::is_void_v<_ignored> && CTorsNotOnDevice<Args...>::value && - std::is_constructible<T, Args...>::value, + std::is_constructible_v<T, Args...>, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&... 
args) noexcept( noexcept(T(std::forward<Args>(args)...))) @@ -167,9 +167,9 @@ struct EBOBaseImpl<T, false, CTorsNotOnDevice> { template <class T, template <class...> class CtorsNotOnDevice = NoCtorsNotOnDevice> struct StandardLayoutNoUniqueAddressMemberEmulation - : EBOBaseImpl<T, std::is_empty<T>::value, CtorsNotOnDevice> { + : EBOBaseImpl<T, std::is_empty_v<T>, CtorsNotOnDevice> { private: - using ebo_base_t = EBOBaseImpl<T, std::is_empty<T>::value, CtorsNotOnDevice>; + using ebo_base_t = EBOBaseImpl<T, std::is_empty_v<T>, CtorsNotOnDevice>; public: using ebo_base_t::ebo_base_t; diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp index 4babe2d72bd148b7101ce8c28c1acd9d6267aa1d..0dcd5d523d3c530b51ddd55668e299e65bbabfe5 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Error.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_Error.cpp @@ -18,125 +18,54 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif -#include <cstring> -#include <cstdlib> - -#include <ostream> -#include <sstream> +#include <iostream> #include <iomanip> +#include <sstream> #include <stdexcept> +#include <Kokkos_Core.hpp> // show_warnings #include <impl/Kokkos_Error.hpp> -#include <Cuda/Kokkos_Cuda_Error.hpp> -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +void Kokkos::Impl::throw_runtime_exception(const std::string &msg) { + throw std::runtime_error(msg); +} -namespace Kokkos { -namespace Impl { +void Kokkos::Impl::throw_bad_alloc(std::string_view memory_space_name, + std::size_t size, std::string_view label) { + std::stringstream ss; + ss << "Kokkos ERROR: " << memory_space_name + << " memory space failed to allocate " << human_memory_size(size) + << " (label=\"" << label << "\")."; + throw std::runtime_error(ss.str()); +} -void throw_runtime_exception(const std::string &msg) { - throw std::runtime_error(msg); +void 
Kokkos::Impl::log_warning(const std::string &msg) { + if (show_warnings()) { + std::cerr << msg << std::flush; + } } -std::string human_memory_size(size_t arg_bytes) { +std::string Kokkos::Impl::human_memory_size(size_t arg_bytes) { double bytes = arg_bytes; const double K = 1024; const double M = K * 1024; const double G = M * 1024; + const double T = G * 1024; std::ostringstream out; if (bytes < K) { out << std::setprecision(4) << bytes << " B"; } else if (bytes < M) { bytes /= K; - out << std::setprecision(4) << bytes << " K"; + out << std::setprecision(4) << bytes << " KiB"; } else if (bytes < G) { bytes /= M; - out << std::setprecision(4) << bytes << " M"; - } else { + out << std::setprecision(4) << bytes << " MiB"; + } else if (bytes < T) { bytes /= G; - out << std::setprecision(4) << bytes << " G"; - } - return out.str(); -} - -} // namespace Impl - -void Experimental::RawMemoryAllocationFailure::print_error_message( - std::ostream &o) const { - o << "Allocation of size " << Impl::human_memory_size(m_attempted_size); - o << " failed"; - switch (m_failure_mode) { - case FailureMode::OutOfMemoryError: - o << ", likely due to insufficient memory."; - break; - case FailureMode::AllocationNotAligned: - o << " because the allocation was improperly aligned."; - break; - case FailureMode::InvalidAllocationSize: - o << " because the requested allocation size is not a valid size for the" - " requested allocation mechanism (it's probably too large)."; - break; - // TODO move this to the subclass for Cuda-related things - case FailureMode::MaximumCudaUVMAllocationsExceeded: - o << " because the maximum Cuda UVM allocations was exceeded."; - break; - case FailureMode::Unknown: o << " because of an unknown error."; break; - } - o << " (The allocation mechanism was "; - switch (m_mechanism) { - case AllocationMechanism::StdMalloc: o << "standard malloc()."; break; - case AllocationMechanism::CudaMalloc: o << "cudaMalloc()."; break; - case 
AllocationMechanism::CudaMallocManaged: - o << "cudaMallocManaged()."; - break; - case AllocationMechanism::CudaHostAlloc: o << "cudaHostAlloc()."; break; - case AllocationMechanism::HIPMalloc: o << "hipMalloc()."; break; - case AllocationMechanism::HIPHostMalloc: o << "hipHostMalloc()."; break; - case AllocationMechanism::HIPMallocManaged: - o << "hipMallocManaged()."; - break; - case AllocationMechanism::SYCLMallocDevice: - o << "sycl::malloc_device()."; - break; - case AllocationMechanism::SYCLMallocShared: - o << "sycl::malloc_shared()."; - break; - case AllocationMechanism::SYCLMallocHost: - o << "sycl::malloc_host()."; - break; - default: o << "unsupported."; + out << std::setprecision(4) << bytes << " GiB"; + } else { + bytes /= T; + out << std::setprecision(4) << bytes << " TiB"; } - append_additional_error_information(o); - o << ")" << std::endl; -} - -std::string Experimental::RawMemoryAllocationFailure::get_error_message() - const { - std::ostringstream out; - print_error_message(out); return out.str(); } - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -#ifdef KOKKOS_ENABLE_CUDA -namespace Experimental { - -void CudaRawMemoryAllocationFailure::append_additional_error_information( - std::ostream &o) const { - if (m_error_code != cudaSuccess) { - o << " The Cuda allocation returned the error code \"" - << cudaGetErrorName(m_error_code) << "\"."; - } -} - -} // end namespace Experimental -#endif - -} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp index 3d0b1d3274c83a3178b5d72ae01299183fc4a6a7..9a80c7b31b8280828f2712931a1ada5be66fd154 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Error.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp @@ -18,113 +18,19 @@ #define KOKKOS_IMPL_ERROR_HPP #include <string> 
-#include <iosfwd> #include <Kokkos_Macros.hpp> #include <Kokkos_Abort.hpp> #include <Kokkos_Assert.hpp> -namespace Kokkos { -namespace Impl { +namespace Kokkos::Impl { [[noreturn]] void throw_runtime_exception(const std::string &msg); +[[noreturn]] void throw_bad_alloc(std::string_view memory_space_name, + std::size_t size, std::string_view label); +void log_warning(const std::string &msg); -std::string human_memory_size(size_t arg_bytes); +std::string human_memory_size(size_t bytes); -} // namespace Impl +} // namespace Kokkos::Impl -namespace Experimental { - -class RawMemoryAllocationFailure : public std::bad_alloc { - public: - enum class FailureMode { - OutOfMemoryError, - AllocationNotAligned, - InvalidAllocationSize, - MaximumCudaUVMAllocationsExceeded, - Unknown - }; - enum class AllocationMechanism { - StdMalloc, -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - PosixMemAlign KOKKOS_DEPRECATED, - PosixMMap KOKKOS_DEPRECATED, - IntelMMAlloc KOKKOS_DEPRECATED, #endif - CudaMalloc, - CudaMallocManaged, - CudaHostAlloc, - HIPMalloc, - HIPHostMalloc, - HIPMallocManaged, - SYCLMallocDevice, - SYCLMallocShared, - SYCLMallocHost - }; - - private: - size_t m_attempted_size; - size_t m_attempted_alignment; - FailureMode m_failure_mode; - AllocationMechanism m_mechanism; - - public: - RawMemoryAllocationFailure( - size_t arg_attempted_size, size_t arg_attempted_alignment, - FailureMode arg_failure_mode = FailureMode::OutOfMemoryError, - AllocationMechanism arg_mechanism = - AllocationMechanism::StdMalloc) noexcept - : m_attempted_size(arg_attempted_size), - m_attempted_alignment(arg_attempted_alignment), - m_failure_mode(arg_failure_mode), - m_mechanism(arg_mechanism) {} - - RawMemoryAllocationFailure() noexcept = delete; - - RawMemoryAllocationFailure(RawMemoryAllocationFailure const &) noexcept = - default; - RawMemoryAllocationFailure(RawMemoryAllocationFailure &&) noexcept = default; - - RawMemoryAllocationFailure &operator =( - RawMemoryAllocationFailure const &) 
noexcept = default; - RawMemoryAllocationFailure &operator =( - RawMemoryAllocationFailure &&) noexcept = default; - - ~RawMemoryAllocationFailure() noexcept override = default; - - [[nodiscard]] const char *what() const noexcept override { - if (m_failure_mode == FailureMode::OutOfMemoryError) { - return "Memory allocation error: out of memory"; - } else if (m_failure_mode == FailureMode::AllocationNotAligned) { - return "Memory allocation error: allocation result was under-aligned"; - } - - return nullptr; // unreachable - } - - [[nodiscard]] size_t attempted_size() const noexcept { - return m_attempted_size; - } - - [[nodiscard]] size_t attempted_alignment() const noexcept { - return m_attempted_alignment; - } - - [[nodiscard]] AllocationMechanism allocation_mechanism() const noexcept { - return m_mechanism; - } - - [[nodiscard]] FailureMode failure_mode() const noexcept { - return m_failure_mode; - } - - void print_error_message(std::ostream &o) const; - [[nodiscard]] std::string get_error_message() const; - - virtual void append_additional_error_information(std::ostream &) const {} -}; - -} // end namespace Experimental - -} // namespace Kokkos - -#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp index 04c5e0bd22a29abeea7b6b835e74d5d3c80992e0..58a5de2aa626af6a14f2809bbdf835bf2b933e42 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp @@ -41,7 +41,7 @@ void team_policy_check_valid_storage_level_argument(int level) { std::stringstream ss; ss << "TeamPolicy::set_scratch_size(/*level*/ " << level << ", ...) 
storage level argument must be 0 or 1 to be valid\n"; - Impl::throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } } diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp index 58ed54275a64d6a8915e89f035d2140996f37ae5..5805b78ee75b9c27fd04532a9f628b6b6b1789a1 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp @@ -123,14 +123,14 @@ template <class ExecutionSpace> struct ExecSpaceDerived : ExecSpaceBase { static_assert(check_valid_execution_space<ExecutionSpace>()); static_assert(check_is_regular<ExecutionSpace>()); - void initialize(InitializationSettings const& settings) final { + void initialize(InitializationSettings const& settings) override final { ExecutionSpace::impl_initialize(settings); } - void finalize() final { ExecutionSpace::impl_finalize(); } - void static_fence(std::string const& label) final { + void finalize() override final { ExecutionSpace::impl_finalize(); } + void static_fence(std::string const& label) override final { ExecutionSpace::impl_static_fence(label); } - void print_configuration(std::ostream& os, bool verbose) final { + void print_configuration(std::ostream& os, bool verbose) override final { ExecutionSpace().print_configuration(os, verbose); } }; diff --git a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp b/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp deleted file mode 100644 index 4726a87b97cb2bc79ccbd267cf67927fab315cd4..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp +++ /dev/null @@ -1,279 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP -#define KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP - -#include <Kokkos_Core_fwd.hpp> -#include <Kokkos_Atomic.hpp> - -#include <Kokkos_PointerOwnership.hpp> -#include <impl/Kokkos_SimpleTaskScheduler.hpp> - -namespace Kokkos { -namespace Impl { - -template <class DeviceType, size_t Size, size_t Align = 1, - class SizeType = typename DeviceType::execution_space::size_type> -class FixedBlockSizeMemoryPool - : private MemorySpaceInstanceStorage<typename DeviceType::memory_space> { - public: - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - - private: - using memory_space_storage_base = - MemorySpaceInstanceStorage<typename DeviceType::memory_space>; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord<memory_space>; - - struct alignas(Align) Block { - union { - char ignore; - char data[Size]; - }; - }; - - static constexpr auto actual_size = sizeof(Block); - - // TODO shared allocation tracker - // TODO @optimization put the index values on different cache lines (CPU) or - // pages (GPU)? 
- - tracker_type m_tracker = {}; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - Kokkos::OwningRawPtr<Block> m_first_block = nullptr; - Kokkos::OwningRawPtr<size_type> m_free_indices = nullptr; - - enum : size_type { IndexInUse = ~size_type(0) }; - - public: - FixedBlockSizeMemoryPool(memory_space const& mem_space, size_type num_blocks) - : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block)); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = - record_type::allocate(mem_space, "Kokkos::FixedBlockSizeMemPool_blocks", - num_blocks * sizeof(size_type)); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for (size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool(memory_space const& mem_space, - size_t mempool_capacity, unsigned, unsigned, - unsigned) - : FixedBlockSizeMemoryPool( - mem_space, mempool_capacity / - actual_size) { /* forwarding ctor, must be empty */ - } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - 
FixedBlockSizeMemoryPool const&) = default; - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept { - (void)alloc_size; - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], - current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if (free_idx == IndexInUse) { - return nullptr; - } else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type /*alloc_size*/) const noexcept { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && - offset / actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } -}; - -#if 0 -template < - class DeviceType, - size_t Size, - size_t Align=1, - class SizeType = typename DeviceType::execution_space::size_type -> -class FixedBlockSizeChaseLevMemoryPool - : private MemorySpaceInstanceStorage<typename DeviceType::memory_space> -{ -public: - - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - -private: - - using memory_space_storage_base = MemorySpaceInstanceStorage<typename DeviceType::memory_space>; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord<memory_space>; - - 
struct alignas(Align) Block { union { char ignore; char data[Size]; }; }; - - static constexpr auto actual_size = sizeof(Block); - - tracker_type m_tracker = { }; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - - - enum : size_type { IndexInUse = ~size_type(0) }; - -public: - - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_type num_blocks - ) : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) - { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block) - ); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(size_type) - ); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for(size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_t mempool_capacity, - unsigned, unsigned, unsigned - ) : FixedBlockSizeMemoryPool(mem_space, mempool_capacity / actual_size) - { /* forwarding ctor, must be empty */ } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& 
operator=(FixedBlockSizeMemoryPool const&) = default; - - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept - { - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add((volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = - Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if(free_idx == IndexInUse) { - return nullptr; - } - else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type alloc_size) const noexcept - { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && offset/actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add((volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } - -}; -#endif - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp index e844a5295e504675dc8c07c7b736e0e28d4226cc..29a365e6e41828bd4c9b4f46b1522dc95f701fa7 100644 --- a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -118,8 +118,8 @@ struct FunctorAnalysis { using functor_has_space = has_execution_space<Functor>; static_assert(!policy_has_space::value || 
!functor_has_space::value || - std::is_same<typename policy_has_space::type, - typename functor_has_space::type>::value, + std::is_same_v<typename policy_has_space::type, + typename functor_has_space::type>, "Execution Policy and Functor execution space must match"); //---------------------------------------- @@ -136,9 +136,8 @@ struct FunctorAnalysis { typename std::is_void<typename F::value_type>::type> { using type = typename F::value_type; - static_assert(!std::is_reference<type>::value && - std::rank<type>::value <= 1 && - std::extent<type>::value == 0, + static_assert(!std::is_reference_v<type> && std::rank_v<type> <= 1 && + std::extent_v<type> == 0, "Kokkos Functor::value_type is T or T[]"); }; @@ -149,7 +148,7 @@ struct FunctorAnalysis { template <typename F, typename P = PatternInterface, typename V = typename has_value_type<F>::type, - bool T = std::is_void<Tag>::value> + bool T = std::is_void_v<Tag>> struct deduce_value_type { using type = V; }; @@ -290,8 +289,8 @@ struct FunctorAnalysis { using candidate_type = typename deduce_value_type<Functor>::type; enum { - candidate_is_void = std::is_void<candidate_type>::value, - candidate_is_array = std::rank<candidate_type>::value == 1 + candidate_is_void = std::is_void_v<candidate_type>, + candidate_is_array = std::rank_v<candidate_type> == 1 }; //---------------------------------------- @@ -306,7 +305,7 @@ struct FunctorAnalysis { using value_type = std::remove_extent_t<candidate_type>; - static_assert(!std::is_const<value_type>::value, + static_assert(!std::is_const_v<value_type>, "Kokkos functor operator reduce argument cannot be const"); private: @@ -614,21 +613,20 @@ struct FunctorAnalysis { }; template <class F> - struct DeduceJoinNoTag<F, std::enable_if_t<(is_reducer<F>::value || - (!is_reducer<F>::value && - std::is_void<Tag>::value)) && - detected_join_no_tag<F>::value>> + struct DeduceJoinNoTag< + F, std::enable_if_t<(is_reducer<F>::value || + (!is_reducer<F>::value && std::is_void_v<Tag>)) && + 
detected_join_no_tag<F>::value>> : public has_join_no_tag_function<F> { enum : bool { value = true }; }; template <class F> struct DeduceJoinNoTag< - F, - std::enable_if_t<(is_reducer<F>::value || - (!is_reducer<F>::value && std::is_void<Tag>::value)) && - (!detected_join_no_tag<F>::value && - detected_volatile_join_no_tag<F>::value)>> + F, std::enable_if_t<(is_reducer<F>::value || + (!is_reducer<F>::value && std::is_void_v<Tag>)) && + (!detected_join_no_tag<F>::value && + detected_volatile_join_no_tag<F>::value)>> : public has_volatile_join_no_tag_function<F> { enum : bool { value = true }; static_assert(Impl::dependent_false_v<F>, @@ -735,8 +733,8 @@ struct FunctorAnalysis { template <class F> struct DeduceInitNoTag< - F, std::enable_if_t<is_reducer<F>::value || (!is_reducer<F>::value && - std::is_void<Tag>::value), + F, std::enable_if_t<is_reducer<F>::value || + (!is_reducer<F>::value && std::is_void_v<Tag>), decltype(has_init_no_tag_function<F>::enable_if( &F::init))>> : public has_init_no_tag_function<F> { @@ -835,8 +833,8 @@ struct FunctorAnalysis { template <class F> struct DeduceFinalNoTag< - F, std::enable_if_t<is_reducer<F>::value || (!is_reducer<F>::value && - std::is_void<Tag>::value), + F, std::enable_if_t<is_reducer<F>::value || + (!is_reducer<F>::value && std::is_void_v<Tag>), decltype(has_final_no_tag_function<F>::enable_if( &F::final))>> : public has_final_no_tag_function<F> { @@ -906,14 +904,14 @@ struct FunctorAnalysis { Functor m_functor; template <bool IsArray> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<IsArray, int> len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<IsArray, int> len() + const noexcept { return m_functor.value_count; } template <bool IsArray> - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<!IsArray, int> len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t<!IsArray, int> len() + const noexcept { return candidate_is_void ? 
0 : 1; } @@ -973,8 +971,8 @@ struct FunctorAnalysis { DeduceJoin<>::join(&m_functor, dst, src); } - KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const - noexcept { + KOKKOS_INLINE_FUNCTION reference_type + init(ValueType* const dst) const noexcept { DeduceInit<>::init(&m_functor, dst); return reference(dst); } @@ -987,11 +985,11 @@ struct FunctorAnalysis { KOKKOS_INLINE_FUNCTION const Functor& get_functor() const { return m_functor; } - Reducer(Reducer const&) = default; - Reducer(Reducer&&) = default; + Reducer(Reducer const&) = default; + Reducer(Reducer&&) = default; Reducer& operator=(Reducer const&) = delete; - Reducer& operator=(Reducer&&) = delete; - ~Reducer() = default; + Reducer& operator=(Reducer&&) = delete; + ~Reducer() = default; KOKKOS_INLINE_FUNCTION explicit constexpr Reducer( Functor const& arg_functor) noexcept diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp index 56f95c814d88480b3037e2bd77454562388a9872..6d3ebf64befc40b612dfcc37b2cc4b8db4559d2f 100644 --- a/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp @@ -56,7 +56,7 @@ struct GraphAccess { static_assert( Kokkos::Impl::is_specialization_of<NodeType, GraphNodeImpl>::value, "Kokkos Internal Error in graph interface"); - return std::make_shared<NodeType>((Args &&) args...); + return std::make_shared<NodeType>((Args&&)args...); } template <class GraphImplWeakPtr, class ExecutionSpace, class Kernel, @@ -83,7 +83,7 @@ struct GraphAccess { Kokkos::Experimental::GraphNodeRef>::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_node_ptr()`)"); - return ((NodeRef &&) node_ref).get_node_ptr(); + return ((NodeRef&&)node_ref).get_node_ptr(); } template <class NodeRef> @@ -93,7 +93,7 @@ struct GraphAccess { Kokkos::Experimental::GraphNodeRef>::value, "Kokkos Internal Implementation error (bad argument to " 
"`GraphAccess::get_graph_weak_ptr()`)"); - return ((NodeRef &&) node_ref).get_graph_weak_ptr(); + return ((NodeRef&&)node_ref).get_graph_weak_ptr(); } // </editor-fold> end accessors for private members of public interface }}}2 diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp index 2ab05cb8e439f0124670c54001f27658e0a123de..b02a26547223498cff75d1ffbb7ec6032e6eefeb 100644 --- a/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp @@ -54,9 +54,9 @@ template <template <class, class, class> class Template, class TSrc, class USrc, struct is_compatible_type_erasure< Template<TSrc, USrc, VSrc>, Template<TDst, UDst, VDst>, // Because gcc thinks this is ambiguous, we need to add this: - std::enable_if_t<!std::is_same<TSrc, TDst>::value || - !std::is_same<USrc, UDst>::value || - !std::is_same<VSrc, VDst>::value>> + std::enable_if_t<!std::is_same_v<TSrc, TDst> || + !std::is_same_v<USrc, UDst> || + !std::is_same_v<VSrc, VDst>>> : std::bool_constant<is_compatible_type_erasure<TSrc, TDst>::value && is_compatible_type_erasure<USrc, UDst>::value && is_compatible_type_erasure<VSrc, VDst>::value> {}; diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphNodeCustomization.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphNodeCustomization.hpp index f4513679ad29210894d02dea29604c8b2da4fc19..3fde487e7daf31a2e1c819a1b5530d10bde1572c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_GraphNodeCustomization.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_GraphNodeCustomization.hpp @@ -52,7 +52,7 @@ struct GraphNodeBackendDetailsBeforeTypeErasure { GraphNodeBackendDetailsBeforeTypeErasure( GraphNodeBackendDetailsBeforeTypeErasure&&) = delete; - GraphNodeBackendDetailsBeforeTypeErasure& operator =( + GraphNodeBackendDetailsBeforeTypeErasure& operator=( GraphNodeBackendDetailsBeforeTypeErasure const&) = delete; 
GraphNodeBackendDetailsBeforeTypeErasure& operator=( diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp index fe8cb89c4fe43ed81aafb2a0930bc6f0b645e948..378e03cb82db43ec35f58e63d44e1f9b5ba9e8ed 100644 --- a/packages/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp @@ -22,7 +22,6 @@ #include <Kokkos_Core_fwd.hpp> #include <Kokkos_Graph_fwd.hpp> -#include <impl/Kokkos_SimpleTaskScheduler.hpp> // ExecutionSpaceInstanceStorage #include <impl/Kokkos_GraphImpl.hpp> #include <impl/Kokkos_GraphNodeCustomization.hpp> @@ -74,8 +73,7 @@ struct GraphNodeImpl<ExecutionSpace, Kokkos::Experimental::TypeErasedTag, template <class... Args> GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, Args&&... args) noexcept - : implementation_base_t(_graph_node_is_root_ctor_tag{}, - (Args &&) args...), + : implementation_base_t(_graph_node_is_root_ctor_tag{}, (Args&&)args...), execution_space_storage_base_t(ex) {} // </editor-fold> end public(-ish) constructors }}}2 @@ -84,11 +82,11 @@ struct GraphNodeImpl<ExecutionSpace, Kokkos::Experimental::TypeErasedTag, //---------------------------------------------------------------------------- // <editor-fold desc="no other constructors"> {{{2 - GraphNodeImpl() = delete; - GraphNodeImpl(GraphNodeImpl const&) = delete; - GraphNodeImpl(GraphNodeImpl&&) = delete; + GraphNodeImpl() = delete; + GraphNodeImpl(GraphNodeImpl const&) = delete; + GraphNodeImpl(GraphNodeImpl&&) = delete; GraphNodeImpl& operator=(GraphNodeImpl const&) = delete; - GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; + GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; // </editor-fold> end no other constructors }}}2 //---------------------------------------------------------------------------- @@ -144,23 +142,23 @@ struct GraphNodeImpl<ExecutionSpace, Kernel, template <class KernelDeduced> GraphNodeImpl(ExecutionSpace const& ex, 
_graph_node_kernel_ctor_tag, KernelDeduced&& arg_kernel) - : base_t(ex), m_kernel((KernelDeduced &&) arg_kernel) {} + : base_t(ex), m_kernel((KernelDeduced&&)arg_kernel) {} template <class... Args> GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, Args&&... args) - : base_t(ex, _graph_node_is_root_ctor_tag{}, (Args &&) args...) {} + : base_t(ex, _graph_node_is_root_ctor_tag{}, (Args&&)args...) {} //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // <editor-fold desc="Rule of 6 for not copyable or movable"> {{{3 // Not copyable or movable - GraphNodeImpl() = delete; - GraphNodeImpl(GraphNodeImpl const&) = delete; - GraphNodeImpl(GraphNodeImpl&&) = delete; + GraphNodeImpl() = delete; + GraphNodeImpl(GraphNodeImpl const&) = delete; + GraphNodeImpl(GraphNodeImpl&&) = delete; GraphNodeImpl& operator=(GraphNodeImpl const&) = delete; - GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; - ~GraphNodeImpl() override = default; + GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; + ~GraphNodeImpl() override = default; // </editor-fold> end Rule of 6 for not copyable or movable }}}3 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -228,33 +226,32 @@ struct GraphNodeImpl // <editor-fold desc="Ctors, destructors, and assignment"> {{{2 // Not copyable or movable - GraphNodeImpl() = delete; - GraphNodeImpl(GraphNodeImpl const&) = delete; - GraphNodeImpl(GraphNodeImpl&&) = delete; + GraphNodeImpl() = delete; + GraphNodeImpl(GraphNodeImpl const&) = delete; + GraphNodeImpl(GraphNodeImpl&&) = delete; GraphNodeImpl& operator=(GraphNodeImpl const&) = delete; - GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; - ~GraphNodeImpl() override = default; + GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; + ~GraphNodeImpl() override = default; // Normal kernel-and-predecessor constructor template <class KernelDeduced, class PredecessorPtrDeduced> GraphNodeImpl(ExecutionSpace const& ex, 
_graph_node_kernel_ctor_tag, KernelDeduced&& arg_kernel, _graph_node_predecessor_ctor_tag, PredecessorPtrDeduced&& arg_predecessor) - : base_t(ex, _graph_node_kernel_ctor_tag{}, - (KernelDeduced &&) arg_kernel), + : base_t(ex, _graph_node_kernel_ctor_tag{}, (KernelDeduced&&)arg_kernel), // The backend gets the ability to store (weak, non-owning) references // to the kernel in it's final resting place here if it wants. The // predecessor is already a pointer, so it doesn't matter that it isn't // already at its final address backend_details_base_t(ex, this->base_t::get_kernel(), arg_predecessor, *this), - m_predecessor_ref((PredecessorPtrDeduced &&) arg_predecessor) {} + m_predecessor_ref((PredecessorPtrDeduced&&)arg_predecessor) {} // Root-tagged constructor template <class... Args> GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, Args&&... args) - : base_t(ex, _graph_node_is_root_ctor_tag{}, (Args &&) args...), + : base_t(ex, _graph_node_is_root_ctor_tag{}, (Args&&)args...), backend_details_base_t(ex, _graph_node_is_root_ctor_tag{}, *this), m_predecessor_ref() {} diff --git a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp deleted file mode 100644 index cd640b88cb92ac56a3ec1914d13d95882a6a3a86..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ /dev/null @@ -1,313 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include <Kokkos_Macros.hpp> - -#include <cstddef> -#include <cstdlib> -#include <cstdint> -#include <cstring> - -#include <iostream> -#include <sstream> -#include <cstring> -#include <algorithm> - -#include <Kokkos_HBWSpace.hpp> -#include <impl/Kokkos_Error.hpp> -#include <impl/Kokkos_MemorySpace.hpp> -#include <Kokkos_Atomic.hpp> -#ifdef KOKKOS_ENABLE_HBWSPACE -#include <memkind.h> -#endif - -#include <impl/Kokkos_Tools.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#ifdef KOKKOS_ENABLE_HBWSPACE -#define MEMKIND_TYPE MEMKIND_HBW // hbw_get_kind(HBW_PAGESIZE_4KB) - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Experimental { - -/* Default allocation mechanism */ -HBWSpace::HBWSpace() : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); -} - -/* Default allocation mechanism */ -HBWSpace::HBWSpace(const HBWSpace::AllocationMechanism &arg_alloc_mech) - : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init2\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); - if (arg_alloc_mech == STD_MALLOC) { - m_alloc_mech = HBWSpace::STD_MALLOC; - } -} - -void *HBWSpace::allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); -} -void *HBWSpace::allocate(const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); -} -void *HBWSpace::impl_allocate( - const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - static_assert(sizeof(void *) == sizeof(uintptr_t), - "Error sizeof(void*) 
!= sizeof(uintptr_t)"); - - static_assert( - Kokkos::Impl::power_of_two<Kokkos::Impl::MEMORY_ALIGNMENT>::value, - "Memory alignment must be power of two"); - - constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; - constexpr uintptr_t alignment_mask = alignment - 1; - - void *ptr = nullptr; - - if (arg_alloc_size) { - if (m_alloc_mech == STD_MALLOC) { - // Over-allocate to and round up to guarantee proper alignment. - size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; - - void *alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded); - - if (alloc_ptr) { - uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr); - - // offset enough to record the alloc_ptr - address += sizeof(void *); - uintptr_t rem = address % alignment; - uintptr_t offset = rem ? (alignment - rem) : 0u; - address += offset; - ptr = reinterpret_cast<void *>(address); - // record the alloc'd pointer - address -= sizeof(void *); - *reinterpret_cast<void **>(address) = alloc_ptr; - } - } - } - - if ((ptr == nullptr) || (reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0)) || - (reinterpret_cast<uintptr_t>(ptr) & alignment_mask)) { - std::ostringstream msg; - msg << "Kokkos::Experimental::HBWSpace::allocate[ "; - switch (m_alloc_mech) { - case STD_MALLOC: msg << "STD_MALLOC"; break; - case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN"; break; - case POSIX_MMAP: msg << "POSIX_MMAP"; break; - case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC"; break; - } - msg << " ]( " << arg_alloc_size << " ) FAILED"; - if (ptr == nullptr) { - msg << " nullptr"; - } else { - msg << " NOT ALIGNED " << ptr; - } - - std::cerr << msg.str() << std::endl; - std::cerr.flush(); - - Kokkos::Impl::throw_runtime_exception(msg.str()); - } - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); - } - - return ptr; -} - -void HBWSpace::deallocate(void *const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); -} -void HBWSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); -} -void HBWSpace::impl_deallocate( - const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - if (arg_alloc_ptr) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, - reported_size); - } - - if (m_alloc_mech == STD_MALLOC) { - void *alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) - 1); - memkind_free(MEMKIND_TYPE, alloc_ptr); - } - } -} - -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord<void, void> - SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::s_root_record; -#endif - -void SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::deallocate( - SharedAllocationRecord<void, void> *arg_rec) { - delete static_cast<SharedAllocationRecord *>(arg_rec); -} - -SharedAllocationRecord<Kokkos::Experimental::HBWSpace, - void>::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - SharedAllocationRecord<void, void>::m_alloc_size, - 
(SharedAllocationRecord<void, void>::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>:: - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : SharedAllocationRecord<void, void>( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::Experimental::HBWSpace, - void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast<SharedAllocationRecord<void, void> *>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - -//---------------------------------------------------------------------------- - -void * -SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::allocate_tracked( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord<Kokkos::Experimental::HBWSpace, - void>::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void 
*SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, - Kokkos::Experimental::HBWSpace>( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord<Kokkos::Experimental::HBWSpace, " - "void>::reallocate_tracked(): fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void> - *SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = - SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>; - - SharedAllocationHeader const *const head = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - RecordHost *const record = - head ? static_cast<RecordHost *>(head->m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -// Iterate records to print orphaned memory ... 
-void SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>:: - print_records(std::ostream &s, const Kokkos::Experimental::HBWSpace &space, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord<void, void>::print_host_accessible_records( - s, "HBWSpace", &s_root_record, detail); -#else - throw_runtime_exception( - "SharedAllocationRecord<HBWSpace>::print_records" - " only works with KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index 4a22898d168257def9a8e984e97b98216f9e1475..467f544cf8e2f7646550e3bdd001187240605e0d 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -196,12 +196,12 @@ KOKKOS_INLINE_FUNCTION template <class T> static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&); + T x, const Kokkos::Impl::half_impl_t::type&); #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template <class T> static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&); + T x, const Kokkos::Impl::bhalf_impl_t::type&); #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED template <class T> @@ -283,13 +283,6 @@ class alignas(FloatType) floating_point_wrapper { private: impl_type val; - using fixed_width_integer_type = std::conditional_t< - sizeof(impl_type) == 2, uint16_t, - std::conditional_t< - sizeof(impl_type) == 4, uint32_t, - std::conditional_t<sizeof(impl_type) == 8, uint64_t, void>>>; - static_assert(!std::is_void<fixed_width_integer_type>::value, - "Invalid impl_type"); public: // In-class initialization and defaulted default constructors not used @@ -318,18 +311,6 @@ class alignas(FloatType) floating_point_wrapper { default; 
#endif - KOKKOS_INLINE_FUNCTION - floating_point_wrapper(const volatile floating_point_wrapper& rhs) { -#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL) - val = rhs.val; -#else - const volatile fixed_width_integer_type* rv_ptr = - reinterpret_cast<const volatile fixed_width_integer_type*>(&rhs.val); - const fixed_width_integer_type rv_val = *rv_ptr; - val = reinterpret_cast<const impl_type&>(rv_val); -#endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - } - KOKKOS_FUNCTION floating_point_wrapper(bit_comparison_type rhs) { val = Kokkos::bit_cast<impl_type>(rhs); @@ -492,15 +473,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - template <class T> - KOKKOS_FUNCTION void operator=(T rhs) volatile { - impl_type new_val = cast_to_wrapper(rhs, val).val; - volatile fixed_width_integer_type* val_ptr = - reinterpret_cast<volatile fixed_width_integer_type*>( - const_cast<impl_type*>(&val)); - *val_ptr = reinterpret_cast<fixed_width_integer_type&>(new_val); - } - // Compound operators KOKKOS_FUNCTION floating_point_wrapper& operator+=(floating_point_wrapper rhs) { @@ -515,15 +487,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator+=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs += tmp_rhs; - *this = tmp_lhs; - } - // Compound operators: upcast overloads for += template <class T> KOKKOS_FUNCTION friend std::enable_if_t< @@ -560,15 +523,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator-=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs -= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for -= template <class T> KOKKOS_FUNCTION friend std::enable_if_t< @@ -605,15 +559,6 @@ class alignas(FloatType) 
floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator*=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs *= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for *= template <class T> KOKKOS_FUNCTION friend std::enable_if_t< @@ -650,15 +595,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator/=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs /= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for /= template <class T> KOKKOS_FUNCTION friend std::enable_if_t< @@ -884,27 +820,6 @@ class alignas(FloatType) floating_point_wrapper { #endif } - KOKKOS_FUNCTION - friend bool operator==(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs == tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator!=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs != tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator<(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs < tmp_rhs; - } - template <class T> KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> && (std::is_same_v<T, float> || @@ -923,13 +838,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs < static_cast<float>(rhs); } - KOKKOS_FUNCTION - friend bool operator>(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs > tmp_rhs; - } - 
template <class T> KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> && (std::is_same_v<T, float> || @@ -948,13 +856,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs > static_cast<float>(rhs); } - KOKKOS_FUNCTION - friend bool operator<=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs <= tmp_rhs; - } - template <class T> KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> && (std::is_same_v<T, float> || @@ -973,13 +874,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs <= static_cast<float>(rhs); } - KOKKOS_FUNCTION - friend bool operator>=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs >= tmp_rhs; - } - template <class T> KOKKOS_FUNCTION friend std::enable_if_t<std::is_convertible_v<T, float> && (std::is_same_v<T, float> || @@ -1018,14 +912,14 @@ class alignas(FloatType) floating_point_wrapper { // Declare wrapper overloads now that floating_point_wrapper is declared template <class T> static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&) { + T x, const Kokkos::Impl::half_impl_t::type&) { return Kokkos::Experimental::cast_to_half(x); } #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template <class T> static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) { + T x, const Kokkos::Impl::bhalf_impl_t::type&) { return Kokkos::Experimental::cast_to_bhalf(x); } #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED @@ -1093,13 +987,12 @@ half_t cast_to_half(unsigned long long val) { return half_t(val); } // example don't include char template <class T> KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same<T, float>::value || 
std::is_same<T, bool>::value || - std::is_same<T, double>::value || std::is_same<T, short>::value || - std::is_same<T, unsigned short>::value || std::is_same<T, int>::value || - std::is_same<T, unsigned int>::value || std::is_same<T, long>::value || - std::is_same<T, unsigned long>::value || - std::is_same<T, long long>::value || - std::is_same<T, unsigned long long>::value, + std::is_same_v<T, float> || std::is_same_v<T, bool> || + std::is_same_v<T, double> || std::is_same_v<T, short> || + std::is_same_v<T, unsigned short> || std::is_same_v<T, int> || + std::is_same_v<T, unsigned int> || std::is_same_v<T, long> || + std::is_same_v<T, unsigned long> || std::is_same_v<T, long long> || + std::is_same_v<T, unsigned long long>, T> cast_from_half(half_t val) { return T(val); @@ -1153,13 +1046,12 @@ bhalf_t cast_to_bhalf(unsigned long long val) { return bhalf_t(val); } // cast_from_bhalf template <class T> KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same<T, float>::value || std::is_same<T, bool>::value || - std::is_same<T, double>::value || std::is_same<T, short>::value || - std::is_same<T, unsigned short>::value || std::is_same<T, int>::value || - std::is_same<T, unsigned int>::value || std::is_same<T, long>::value || - std::is_same<T, unsigned long>::value || - std::is_same<T, long long>::value || - std::is_same<T, unsigned long long>::value, + std::is_same_v<T, float> || std::is_same_v<T, bool> || + std::is_same_v<T, double> || std::is_same_v<T, short> || + std::is_same_v<T, unsigned short> || std::is_same_v<T, int> || + std::is_same_v<T, unsigned int> || std::is_same_v<T, long> || + std::is_same_v<T, unsigned long> || std::is_same_v<T, long long> || + std::is_same_v<T, unsigned long long>, T> cast_from_bhalf(bhalf_t val) { return T(val); diff --git a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp index 76500bc56339088691952fcdfe1876edc960fd33..c423468f460de302d226858f8ae5e2acae3dff84 100644 --- 
a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp @@ -155,15 +155,15 @@ class HostBarrier { KOKKOS_INLINE_FUNCTION void wait() const noexcept { wait(m_buffer, m_size, m_step); } - HostBarrier() = default; - HostBarrier(HostBarrier&&) = default; + HostBarrier() = default; + HostBarrier(HostBarrier&&) = default; HostBarrier& operator=(HostBarrier&&) = default; KOKKOS_INLINE_FUNCTION HostBarrier(int size, int* buffer) : m_size{size}, m_step{0u}, m_buffer{buffer} {} - HostBarrier(const HostBarrier&) = delete; + HostBarrier(const HostBarrier&) = delete; HostBarrier& operator=(const HostBarrier&) = delete; private: diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp index a9d72160593741b5334134fc9af847a55a9e5e8f..5f4c66984b3bbeebec3bd99b36e2aae38166e9f7 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -20,23 +20,11 @@ #include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> +#include <Kokkos_HostSpace.hpp> #include <impl/Kokkos_Error.hpp> -#include <impl/Kokkos_MemorySpace.hpp> #include <impl/Kokkos_Tools.hpp> -/*--------------------------------------------------------------------------*/ - -#if (defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)) && \ - !defined(KOKKOS_ENABLE_CUDA) - -// Intel specialized allocator does not interoperate with CUDA memory allocation - -#define KOKKOS_ENABLE_INTEL_MM_ALLOC - -#endif - -/*--------------------------------------------------------------------------*/ - #include <cstddef> #include <cstdlib> #include <cstdint> @@ -50,10 +38,6 @@ #include <aligned_new> #endif -#include <Kokkos_HostSpace.hpp> -#include <impl/Kokkos_Error.hpp> -#include <Kokkos_Atomic.hpp> - //---------------------------------------------------------------------------- 
//---------------------------------------------------------------------------- @@ -92,25 +76,12 @@ void *HostSpace::impl_allocate( void *ptr = nullptr; if (arg_alloc_size) - ptr = operator new (arg_alloc_size, std::align_val_t(alignment), - std::nothrow_t{}); + ptr = operator new(arg_alloc_size, std::align_val_t(alignment), + std::nothrow_t{}); - if ((ptr == nullptr) || (reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0)) || + if (!ptr || (reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0)) || (reinterpret_cast<uintptr_t>(ptr) & alignment_mask)) { - Experimental::RawMemoryAllocationFailure::FailureMode failure_mode = - Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned; - if (ptr == nullptr) { - failure_mode = Experimental::RawMemoryAllocationFailure::FailureMode:: - OutOfMemoryError; - } - - Experimental::RawMemoryAllocationFailure::AllocationMechanism alloc_mec = - Experimental::RawMemoryAllocationFailure::AllocationMechanism:: - StdMalloc; - - throw Kokkos::Experimental::RawMemoryAllocationFailure( - arg_alloc_size, alignment, failure_mode, alloc_mec); + Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); @@ -125,9 +96,8 @@ void HostSpace::deallocate(void *const arg_alloc_ptr, void HostSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, const size_t arg_alloc_size, - const size_t - - arg_logical_size) const { + const size_t arg_logical_size) const { + if (arg_alloc_ptr) Kokkos::fence("HostSpace::impl_deallocate before free"); impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); } void HostSpace::impl_deallocate( @@ -135,7 +105,6 @@ void HostSpace::impl_deallocate( const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { if (arg_alloc_ptr) { - Kokkos::fence("HostSpace::impl_deallocate before free"); size_t 
reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; if (Kokkos::Profiling::profileLibraryLoaded()) { @@ -143,91 +112,13 @@ void HostSpace::impl_deallocate( reported_size); } constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; - operator delete (arg_alloc_ptr, std::align_val_t(alignment), - std::nothrow_t{}); + operator delete(arg_alloc_ptr, std::align_val_t(alignment), + std::nothrow_t{}); } } } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord<void, void> - SharedAllocationRecord<Kokkos::HostSpace, void>::s_root_record; -#endif - -SharedAllocationRecord<Kokkos::HostSpace, void>::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord<void, void>::m_alloc_ptr, - SharedAllocationRecord<void, void>::m_alloc_size, - (SharedAllocationRecord<void, void>::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationHeader *_do_allocation(Kokkos::HostSpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast<SharedAllocationHeader *>( - space.allocate(alloc_size)); - } catch (Experimental::RawMemoryAllocationFailure const &failure) { - if (failure.failure_mode() == Experimental::RawMemoryAllocationFailure:: - FailureMode::AllocationNotAligned) { - // TODO: delete the misaligned memory - } - - std::cerr << "Kokkos failed to allocate memory for label \"" << label - << "\". 
Allocation using MemorySpace named \"" << space.name() - << " failed with the following error: "; - failure.print_error_message(std::cerr); - std::cerr.flush(); - Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); - } - return nullptr; // unreachable -} - -SharedAllocationRecord<Kokkos::HostSpace, void>::SharedAllocationRecord( - const Kokkos::HostSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord<void, void>::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord<Kokkos::HostSpace, void>::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//============================================================================== -// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1 - #include <impl/Kokkos_SharedAlloc_timpl.hpp> -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class SharedAllocationRecordCommon<Kokkos::HostSpace>; - -} // end namespace Impl -} // end namespace Kokkos - -// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HostSpace); diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp index f740c408fb8fddefff501f46937e1942aa691bff..40111d3c4ae2ecaa8798da916c7e961386763159 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -24,23 +24,15 @@ namespace Kokkos { namespace Impl { -template <class T, class... P> -struct ZeroMemset<HostSpace::execution_space, View<T, P...>> { - ZeroMemset(const HostSpace::execution_space& exec, const View<T, P...>& dst, - typename View<T, P...>::const_value_type&) { +template <> +struct ZeroMemset<HostSpace::execution_space> { + ZeroMemset(const HostSpace::execution_space& exec, void* dst, size_t cnt) { // Host spaces, except for HPX, are synchronous and we need to fence for HPX // since we can't properly enqueue a std::memset otherwise. // We can't use exec.fence() directly since we don't have a full definition // of HostSpace here. 
hostspace_fence(exec); - using ValueType = typename View<T, P...>::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } - - ZeroMemset(const View<T, P...>& dst, - typename View<T, P...>::const_value_type&) { - using ValueType = typename View<T, P...>::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); + std::memset(dst, 0, cnt); } }; diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp index 84f525061ee3470bdbe716bb228b9ab00c8e26cd..e6fd80e9b8e339f5e122b0de7f9f284f02cc9d46 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp @@ -43,10 +43,10 @@ void hostspace_parallel_deepcopy_async(void* dst, const void* src, "Kokkos::Impl::hostspace_parallel_deepcopy_async: fence after copy"); } -void hostspace_parallel_deepcopy_async(const DefaultHostExecutionSpace& exec, - void* dst, const void* src, - ptrdiff_t n) { - using policy_t = Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>; +template <typename ExecutionSpace> +void hostspace_parallel_deepcopy_async(const ExecutionSpace& exec, void* dst, + const void* src, ptrdiff_t n) { + using policy_t = Kokkos::RangePolicy<ExecutionSpace>; // If the asynchronous HPX backend is enabled, do *not* copy anything // synchronously. 
The deep copy must be correctly sequenced with respect to @@ -55,8 +55,7 @@ void hostspace_parallel_deepcopy_async(const DefaultHostExecutionSpace& exec, #if !(defined(KOKKOS_ENABLE_HPX) && \ defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH)) constexpr int host_deep_copy_serial_limit = 10 * 8192; - if ((n < host_deep_copy_serial_limit) || - (DefaultHostExecutionSpace().concurrency() == 1)) { + if ((n < host_deep_copy_serial_limit) || (exec.concurrency() == 1)) { if (0 < n) std::memcpy(dst, src, n); return; } @@ -138,6 +137,18 @@ void hostspace_parallel_deepcopy_async(const DefaultHostExecutionSpace& exec, } } +// Explicit instantiation +template void hostspace_parallel_deepcopy_async<DefaultHostExecutionSpace>( + const DefaultHostExecutionSpace&, void*, const void*, ptrdiff_t); + +#if defined(KOKKOS_ENABLE_SERIAL) && \ + (defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_THREADS) || \ + defined(KOKKOS_ENABLE_HPX)) +// Instantiate only if both the Serial backend and some other host parallel +// backend are enabled +template void hostspace_parallel_deepcopy_async<Kokkos::Serial>( + const Kokkos::Serial&, void*, const void*, ptrdiff_t); +#endif } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp index 2bb4c3e99b34fc4ef7fc6506fd42c61d1812b8c2..aeedff0167d3e89c7d6299e5390898e205a7bd5a 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp @@ -28,9 +28,9 @@ void hostspace_fence(const DefaultHostExecutionSpace& exec); void hostspace_parallel_deepcopy(void* dst, const void* src, ptrdiff_t n); // DeepCopy called with an execution space that can't access HostSpace void hostspace_parallel_deepcopy_async(void* dst, const void* src, ptrdiff_t n); -void hostspace_parallel_deepcopy_async(const DefaultHostExecutionSpace& exec, - void* dst, const void* src, ptrdiff_t n); - 
+template <typename ExecutionSpace> +void hostspace_parallel_deepcopy_async(const ExecutionSpace& exec, void* dst, + const void* src, ptrdiff_t n); } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp index bfe5902bf7ffd7665e1db721fa5ae1a9fe2d6b0e..4bb7f639b7335b39a221d1b333ac3e4d05dd482c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -22,7 +22,6 @@ #include <Kokkos_Macros.hpp> #include <impl/Kokkos_HostThreadTeam.hpp> #include <impl/Kokkos_Error.hpp> -#include <impl/Kokkos_Spinwait.hpp> //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -131,10 +130,10 @@ int HostThreadTeamData::organize_team(const int team_size) { // zombi team around (for example m_pool_size = 5 and team_size = 2 // (ii) if team_alloc > team_size then the last team might have less // threads than the others - m_team_rank = (team_base_rank + team_size <= m_pool_size) && + m_team_rank = (team_base_rank + team_size <= m_pool_size) && (team_alloc_rank < team_size) - ? team_alloc_rank - : -1; + ? 
team_alloc_rank + : -1; m_team_size = team_size; m_team_alloc = team_alloc_size; m_league_rank = league_rank; diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index 51f25a8b60f141e8edac0082a34379e2d596f20b..a0175f9107d02d1063230ff8d0b12fdf3555137d 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -106,7 +106,15 @@ class HostThreadTeamData { public: inline bool team_rendezvous() const noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // FIXME_OPENMP The tasking framework creates an instance with + // m_team_scratch == nullptr and m_team_rendezvous != 0: + int* ptr = m_team_scratch == nullptr + ? nullptr + : reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous); +#else int* ptr = reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous); +#endif HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step); if (m_team_rank != 0) { HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step); @@ -131,8 +139,16 @@ class HostThreadTeamData { inline void team_rendezvous_release() const noexcept { HostBarrier::split_release( - reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous), m_team_size, - m_team_rendezvous_step); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // FIXME_OPENMP The tasking framework creates an instance with + // m_team_scratch == nullptr and m_team_rendezvous != 0: + (m_team_scratch == nullptr) + ? 
nullptr + : reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous), +#else + reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous), +#endif + m_team_size, m_team_rendezvous_step); } inline int pool_rendezvous() const noexcept { @@ -271,6 +287,11 @@ class HostThreadTeamData { } int64_t* team_shared() const noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // FIXME_OPENMP The tasking framework creates an instance with + // m_team_scratch == nullptr and m_team_shared != 0 + if (m_team_scratch == nullptr) return nullptr; +#endif return m_team_scratch + m_team_shared; } @@ -347,7 +368,7 @@ class HostThreadTeamData { int const num = (m_work_end + m_work_chunk - 1) / m_work_chunk; int const part = (num + m_league_size - 1) / m_league_size; - m_work_range.first = part * m_league_rank; + m_work_range.first = static_cast<int64_t>(part) * m_league_rank; m_work_range.second = m_work_range.first + part; // Steal from next team, round robin @@ -373,7 +394,7 @@ class HostThreadTeamData { const int i = get_work_stealing(); if (0 <= i) { - x.first = m_work_chunk * i; + x.first = static_cast<int64_t>(m_work_chunk) * i; x.second = x.first + m_work_chunk < m_work_end ? x.first + m_work_chunk : m_work_end; } @@ -401,10 +422,21 @@ class HostThreadTeamMember { public: constexpr HostThreadTeamMember(HostThreadTeamData& arg_data) noexcept - : m_scratch(arg_data.team_shared(), arg_data.team_shared_bytes()), + : m_scratch( + arg_data.team_shared(), +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // FIXME_OPENMP The tasking framework creates an instance with + // m_team_scratch == nullptr and m_team_shared != 0: + (arg_data.team_shared() == nullptr) ? 
0 + : arg_data.team_shared_bytes() +#else + arg_data.team_shared_bytes() +#endif + ), m_data(arg_data), m_league_rank(arg_data.m_league_rank), - m_league_size(arg_data.m_league_size) {} + m_league_size(arg_data.m_league_size) { + } constexpr HostThreadTeamMember(HostThreadTeamData& arg_data, int const arg_league_rank, @@ -415,11 +447,11 @@ class HostThreadTeamMember { m_league_rank(arg_league_rank), m_league_size(arg_league_size) {} - ~HostThreadTeamMember() = default; - HostThreadTeamMember() = delete; - HostThreadTeamMember(HostThreadTeamMember&&) = default; - HostThreadTeamMember(HostThreadTeamMember const&) = default; - HostThreadTeamMember& operator=(HostThreadTeamMember&&) = default; + ~HostThreadTeamMember() = default; + HostThreadTeamMember() = delete; + HostThreadTeamMember(HostThreadTeamMember&&) = default; + HostThreadTeamMember(HostThreadTeamMember const&) = default; + HostThreadTeamMember& operator=(HostThreadTeamMember&&) = default; HostThreadTeamMember& operator=(HostThreadTeamMember const&) = default; //---------------------------------------- @@ -466,9 +498,8 @@ class HostThreadTeamMember { //-------------------------------------------------------------------------- template <typename T> - KOKKOS_INLINE_FUNCTION void team_broadcast(T& value, - const int source_team_rank) const - noexcept { + KOKKOS_INLINE_FUNCTION void team_broadcast( + T& value, const int source_team_rank) const noexcept { KOKKOS_IF_ON_HOST((if (1 < m_data.m_team_size) { T* const shared_value = (T*)m_data.team_reduce(); @@ -498,9 +529,8 @@ class HostThreadTeamMember { //-------------------------------------------------------------------------- template <class Closure, typename T> - KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, T& value, - const int source_team_rank) const - noexcept { + KOKKOS_INLINE_FUNCTION void team_broadcast( + Closure const& f, T& value, const int source_team_rank) const noexcept { KOKKOS_IF_ON_HOST(( T* const shared_value = 
(T*)m_data.team_reduce(); @@ -537,18 +567,34 @@ class HostThreadTeamMember { // team_reduce( Max(result) ); template <typename ReducerType> - KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<ReducerType>> team_reduce(ReducerType const& reducer) const noexcept { team_reduce(reducer, reducer.reference()); } template <typename ReducerType> - KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<ReducerType>> team_reduce(ReducerType const& reducer, typename ReducerType::value_type contribution) const noexcept { + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<Kokkos::DefaultHostExecutionSpace>, ReducerType, + value_type>::Reducer; + + impl_team_reduce(wrapped_reducer_type(reducer), contribution); + reducer.reference() = contribution; + } + + template <typename WrappedReducerType> + KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer_v<WrappedReducerType>> + impl_team_reduce( + WrappedReducerType const& reducer, + typename WrappedReducerType::value_type& contribution) const { KOKKOS_IF_ON_HOST(( + if (1 < m_data.m_team_size) { - using value_type = typename ReducerType::value_type; + using value_type = typename WrappedReducerType::value_type; if (0 != m_data.m_team_rank) { // Non-root copies to their local buffer: @@ -570,22 +616,22 @@ class HostThreadTeamMember { value_type* const src = (value_type*)m_data.team_member(i)->team_reduce_local(); - reducer.join(contribution, *src); + reducer.join(&contribution, src); } // Copy result to root member's buffer: // reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() // ); *((value_type*)m_data.team_reduce()) = contribution; - reducer.reference() = contribution; + m_data.team_rendezvous_release(); // This thread released all other threads from 
'team_rendezvous' // with a return value of 'false' } else { // Copy from root member's buffer: - reducer.reference() = *((value_type*)m_data.team_reduce()); + contribution = *((value_type*)m_data.team_reduce()); } - } else { reducer.reference() = contribution; })) + })) KOKKOS_IF_ON_DEVICE(((void)reducer; (void)contribution; Kokkos::abort("HostThreadTeamMember team_reduce\n");)) @@ -768,15 +814,25 @@ KOKKOS_INLINE_FUNCTION parallel_reduce(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries, Closure const& closure, Reducer const& reducer) { - typename Reducer::value_type value; - reducer.init(value); + using value_type = typename Reducer::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<Kokkos::DefaultHostExecutionSpace>, Reducer, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { closure(i, value); } - loop_boundaries.thread.team_reduce(reducer, value); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; } template <typename iType, typename Closure, typename ValueType, typename Member> @@ -786,17 +842,24 @@ KOKKOS_INLINE_FUNCTION parallel_reduce(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries, Closure const& closure, ValueType& result) { - ValueType val; - Sum<ValueType> reducer(val); - reducer.init(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<Kokkos::DefaultHostExecutionSpace>, Closure, ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + 
wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - closure(i, reducer.reference()); + closure(i, value); } - loop_boundaries.thread.team_reduce(reducer); - result = reducer.reference(); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } /*template< typename iType, class Space @@ -840,11 +903,23 @@ KOKKOS_INLINE_FUNCTION parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Member>& loop_boundaries, const Lambda& lambda, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<Kokkos::DefaultHostExecutionSpace>, Lambda, ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } template <typename iType, class Lambda, typename ReducerType, typename Member> @@ -854,11 +929,23 @@ KOKKOS_INLINE_FUNCTION parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Member>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy<Kokkos::DefaultHostExecutionSpace>, ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + 
wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } //---------------------------------------------------------------------------- @@ -874,7 +961,7 @@ KOKKOS_INLINE_FUNCTION using ClosureValueType = typename Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, void>::value_type; - static_assert(std::is_same<ClosureValueType, ValueType>::value, + static_assert(std::is_same_v<ClosureValueType, ValueType>, "Non-matching value types of closure and return type"); ValueType accum = ValueType(); @@ -885,7 +972,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto team_member = loop_boundaries.thread; + auto& team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); @@ -926,7 +1013,7 @@ KOKKOS_INLINE_FUNCTION using ClosureValueType = typename Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::SCAN, void, ClosureType, void>::value_type; - static_assert(std::is_same<ClosureValueType, ValueType>::value, + static_assert(std::is_same_v<ClosureValueType, ValueType>, "Non-matching value types of closure and return type"); ValueType scan_val = ValueType(); diff --git a/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp b/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp index ab4350f3a7a49d7b0448b32df05cb7555044dd8b..11a93c6bb56b02cd0683f86b68dc1fb4d6b25918 100644 --- a/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp @@ -24,32 +24,6 @@ namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments { - int num_threads; - int num_numa; - int device_id; - int ndevices; - int skip_device; - bool disable_warnings; - bool 
tune_internals; - bool tool_help = false; - std::string tool_lib = {}; - std::string tool_args = {}; - - KOKKOS_DEPRECATED_WITH_COMMENT("Use InitializationSettings instead!") - InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, - bool ti = false) - : num_threads{nt}, - num_numa{nn}, - device_id{dv}, - ndevices{-1}, - skip_device{9999}, - disable_warnings{dw}, - tune_internals{ti} {} -}; -#endif - class InitializationSettings { #define KOKKOS_IMPL_DECLARE(TYPE, NAME) \ private: \ @@ -64,12 +38,32 @@ class InitializationSettings { TYPE get_##NAME() const noexcept { return *m_##NAME; } \ static_assert(true, "no-op to require trailing semicolon") +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + private: \ + std::optional<TYPE> m_##NAME; \ + \ + public: \ + KOKKOS_DEPRECATED InitializationSettings& set_##NAME(TYPE NAME) { \ + m_##NAME = NAME; \ + return *this; \ + } \ + KOKKOS_DEPRECATED bool has_##NAME() const noexcept { \ + return static_cast<bool>(m_##NAME); \ + } \ + KOKKOS_DEPRECATED TYPE get_##NAME() const noexcept { return *m_##NAME; } \ + static_assert(true, "no-op to require trailing semicolon") +#else +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + static_assert(true, "no-op to require trailing semicolon") +#endif + public: KOKKOS_IMPL_DECLARE(int, num_threads); KOKKOS_IMPL_DECLARE(int, device_id); KOKKOS_IMPL_DECLARE(std::string, map_device_id_by); - KOKKOS_IMPL_DECLARE(int, num_devices); // deprecated - KOKKOS_IMPL_DECLARE(int, skip_device); // deprecated + KOKKOS_IMPL_DECLARE_DEPRECATED(int, num_devices); + KOKKOS_IMPL_DECLARE_DEPRECATED(int, skip_device); KOKKOS_IMPL_DECLARE(bool, disable_warnings); KOKKOS_IMPL_DECLARE(bool, print_configuration); KOKKOS_IMPL_DECLARE(bool, tune_internals); @@ -80,41 +74,6 @@ class InitializationSettings { #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER #undef KOKKOS_IMPL_DECLARE - -#ifdef 
KOKKOS_ENABLE_DEPRECATED_CODE_3 - public: - InitializationSettings() = default; - - InitializationSettings(InitArguments const& old) { - if (old.num_threads != -1) { - set_num_threads(old.num_threads); - } - if (old.device_id != -1) { - set_device_id(old.device_id); - } - if (old.ndevices != -1) { - set_num_devices(old.ndevices); - } - if (old.skip_device != 9999) { - set_skip_device(old.skip_device); - } - if (old.disable_warnings) { - set_disable_warnings(true); - } - if (old.tune_internals) { - set_tune_internals(true); - } - if (old.tool_help) { - set_tools_help(true); - } - if (!old.tool_lib.empty()) { - set_tools_libs(old.tool_lib); - } - if (!old.tool_args.empty()) { - set_tools_args(old.tool_args); - } - } -#endif }; } // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp b/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp index da7b24352d9552e6e3adbcc03b15b3446de0f2ff..69beeb79e564cdf0da2cf5ad5c0a56c48ed64f82 100644 --- a/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp @@ -123,11 +123,11 @@ class LockBasedLIFO : private LockBasedLIFOCommon<T> { using intrusive_node_base_type = SimpleSinglyLinkedListNode<>; public: - LockBasedLIFO() = default; - LockBasedLIFO(LockBasedLIFO const&) = delete; - LockBasedLIFO(LockBasedLIFO&&) = delete; + LockBasedLIFO() = default; + LockBasedLIFO(LockBasedLIFO const&) = delete; + LockBasedLIFO(LockBasedLIFO&&) = delete; LockBasedLIFO& operator=(LockBasedLIFO const&) = delete; - LockBasedLIFO& operator=(LockBasedLIFO&&) = delete; + LockBasedLIFO& operator=(LockBasedLIFO&&) = delete; ~LockBasedLIFO() = default; diff --git a/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp b/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp index 99f3d1e4fc8964319bd9cd6164f2373b8e9f8e00..d84967501492b092b7d84558498612225578c0ec 100644 --- a/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp @@ 
-43,7 +43,7 @@ template <uintptr_t NotEnqueuedValue = 0, struct SimpleSinglyLinkedListNode { private: using pointer_type = - typename PointerTemplate<SimpleSinglyLinkedListNode>::type; + typename PointerTemplate<SimpleSinglyLinkedListNode>::type; // NOLINT pointer_type m_next = reinterpret_cast<pointer_type>(NotEnqueuedValue); diff --git a/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp b/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp deleted file mode 100644 index 69cb9ccd049664d633181f6aaaf54c121950af5c..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp +++ /dev/null @@ -1,103 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -// Experimental unified task-data parallel manycore LDRD - -#ifndef KOKKOS_IMPL_MEMORYPOOLALLOCATOR_HPP -#define KOKKOS_IMPL_MEMORYPOOLALLOCATOR_HPP - -#include <Kokkos_Macros.hpp> - -#include <Kokkos_Core_fwd.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -template <class MemoryPool, class T> -class MemoryPoolAllocator { - public: - using memory_pool = MemoryPool; - - private: - memory_pool m_pool; - - public: - KOKKOS_DEFAULTED_FUNCTION - MemoryPoolAllocator() = default; - KOKKOS_DEFAULTED_FUNCTION - MemoryPoolAllocator(MemoryPoolAllocator const&) = default; - KOKKOS_DEFAULTED_FUNCTION - MemoryPoolAllocator(MemoryPoolAllocator&&) = default; - KOKKOS_DEFAULTED_FUNCTION - MemoryPoolAllocator& operator=(MemoryPoolAllocator const&) = default; - KOKKOS_DEFAULTED_FUNCTION - MemoryPoolAllocator& operator=(MemoryPoolAllocator&&) = default; - KOKKOS_DEFAULTED_FUNCTION - ~MemoryPoolAllocator() = default; - - KOKKOS_INLINE_FUNCTION - explicit MemoryPoolAllocator(memory_pool const& arg_pool) - : m_pool(arg_pool) {} - KOKKOS_INLINE_FUNCTION - explicit MemoryPoolAllocator(memory_pool&& arg_pool) - : m_pool(std::move(arg_pool)) {} - - public: - using value_type = T; - using pointer = T*; - using size_type = typename MemoryPool::memory_space::size_type; - using difference_type = std::make_signed_t<size_type>; - - template <class U> - struct rebind { - using other = MemoryPoolAllocator<MemoryPool, U>; - }; - - KOKKOS_INLINE_FUNCTION - pointer allocate(size_t n) { - void* rv = m_pool.allocate(n * sizeof(T)); - if (rv == nullptr) { - Kokkos::abort("Kokkos MemoryPool allocator failed to allocate memory"); - } - return reinterpret_cast<T*>(rv); - } - - KOKKOS_INLINE_FUNCTION - void deallocate(T* ptr, size_t n) { m_pool.deallocate(ptr, n * sizeof(T)); 
} - - KOKKOS_INLINE_FUNCTION - size_type max_size() const { return m_pool.max_block_size(); } - - KOKKOS_INLINE_FUNCTION - bool operator==(MemoryPoolAllocator const& other) const { - return m_pool == other.m_pool; - } - - KOKKOS_INLINE_FUNCTION - bool operator!=(MemoryPoolAllocator const& other) const { - return !(*this == other); - } -}; - -} // end namespace Impl -} // end namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_IMPL_MEMORYPOOLALLOCATOR_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp deleted file mode 100644 index 2f0e01c5b28da1c4db310f7b4f6f095bd5626a91..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.cpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. 
- */ - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include <impl/Kokkos_MemorySpace.hpp> - -#include <iostream> -#include <string> -#include <sstream> - -namespace Kokkos { -namespace Impl { - -void safe_throw_allocation_with_header_failure( - std::string const& space_name, std::string const& label, - Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - auto generate_failure_message = [&](std::ostream& o) { - o << "Kokkos failed to allocate memory for label \"" << label - << "\". Allocation using MemorySpace named \"" << space_name - << "\" failed with the following error: "; - failure.print_error_message(o); - if (failure.failure_mode() == - Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned) { - // TODO: delete the misaligned memory? - o << "Warning: Allocation failed due to misalignment; memory may " - "be leaked.\n"; - } - o.flush(); - }; - try { - std::ostringstream sstr; - generate_failure_message(sstr); - Kokkos::Impl::throw_runtime_exception(sstr.str()); - } catch (std::bad_alloc const&) { - // Probably failed to allocate the string because we're so close to out - // of memory. Try printing to std::cerr instead - try { - generate_failure_message(std::cerr); - } catch (std::bad_alloc const&) { - // oh well, we tried... 
- } - Kokkos::Impl::throw_runtime_exception( - "Kokkos encountered an allocation failure, then another allocation " - "failure while trying to create the error message."); - } -} - -} // end namespace Impl -} // end namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp deleted file mode 100644 index 44956dd7c5d904af38bf3d64d32e15a4f6e9946d..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.hpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. 
- */ - -#ifndef KOKKOS_IMPL_MEMORYSPACE_HPP -#define KOKKOS_IMPL_MEMORYSPACE_HPP - -#include <Kokkos_Macros.hpp> -#include <impl/Kokkos_SharedAlloc.hpp> -#include <impl/Kokkos_Error.hpp> - -#include <string> - -namespace Kokkos { -namespace Impl { - -// Defined in implementation file to avoid having to include iostream -void safe_throw_allocation_with_header_failure( - std::string const &space_name, std::string const &label, - Kokkos::Experimental::RawMemoryAllocationFailure const &failure); - -template <class MemorySpace> -SharedAllocationHeader *checked_allocation_with_header(MemorySpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast<SharedAllocationHeader *>(space.allocate( - label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -template <class ExecutionSpace, class MemorySpace> -SharedAllocationHeader *checked_allocation_with_header( - ExecutionSpace const &exec_space, MemorySpace const &space, - std::string const &label, size_t alloc_size) { - try { - return reinterpret_cast<SharedAllocationHeader *>(space.allocate( - exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_MEMORYSPACE_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp deleted file mode 100644 index 42a53b04fb2a940ae466dd1aa90bef90ad6c42b1..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp +++ /dev/null @@ 
-1,54 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include <Kokkos_Macros.hpp> -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_MEMORY_FENCE_HPP) -#define KOKKOS_MEMORY_FENCE_HPP -namespace Kokkos { - -////////////////////////////////////////////////////// -// store_fence() -// -// If possible use a store fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void store_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("sfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -////////////////////////////////////////////////////// -// load_fence() -// -// If possible use a load fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void load_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("lfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -} // namespace Kokkos - -#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp index 6895f4271f6a8614eddfc8f1b906cf11aea872be..1046bae5abe30ed3ad90d5475a775c4a5131fcbf 100644 --- a/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp @@ -40,6 +40,11 @@ //---------------------------------------------------------------------------- 
//---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -94,8 +99,8 @@ struct MultipleTaskQueueTeamEntry { template <class _always_void = void> KOKKOS_INLINE_FUNCTION OptionalRef<task_base_type> _pop_failed_insertion( int priority, TaskType type, - std::enable_if_t<task_queue_traits::ready_queue_insertion_may_fail && - std::is_void<_always_void>::value, + std::enable_if_t<std::is_void_v<_always_void> && + task_queue_traits::ready_queue_insertion_may_fail, void*> = nullptr) { auto* rv_ptr = m_failed_heads[priority][(int)type]; if (rv_ptr) { @@ -112,8 +117,8 @@ struct MultipleTaskQueueTeamEntry { template <class _always_void = void> KOKKOS_INLINE_FUNCTION OptionalRef<task_base_type> _pop_failed_insertion( int /*priority*/, TaskType /*type*/, - std::enable_if_t<!task_queue_traits::ready_queue_insertion_may_fail && - std::is_void<_always_void>::value, + std::enable_if_t<std::is_void_v<_always_void> && + !task_queue_traits::ready_queue_insertion_may_fail, void*> = nullptr) { return OptionalRef<task_base_type>{nullptr}; } @@ -170,8 +175,8 @@ struct MultipleTaskQueueTeamEntry { template <class _always_void = void> KOKKOS_INLINE_FUNCTION void do_handle_failed_insertion( runnable_task_base_type&& task, - std::enable_if_t<task_queue_traits::ready_queue_insertion_may_fail && - std::is_void<_always_void>::value, + std::enable_if_t<std::is_void_v<_always_void> && + task_queue_traits::ready_queue_insertion_may_fail, void*> = nullptr) { // failed insertions, if they happen, must be from the only thread that // is allowed to push to m_ready_queues, so this linked-list insertion is @@ -185,8 +190,8 @@ struct MultipleTaskQueueTeamEntry { template <class _always_void = void> KOKKOS_INLINE_FUNCTION void do_handle_failed_insertion( runnable_task_base_type&& /*task*/, - 
std::enable_if_t<!task_queue_traits::ready_queue_insertion_may_fail && - std::is_void<_always_void>::value, + std::enable_if_t<std::is_void_v<_always_void> && + !task_queue_traits::ready_queue_insertion_may_fail, void*> = nullptr) { Kokkos::abort("should be unreachable!"); } @@ -194,11 +199,9 @@ struct MultipleTaskQueueTeamEntry { template <class _always_void = void> KOKKOS_INLINE_FUNCTION void flush_failed_insertions( int priority, int task_type, - std::enable_if_t< - task_queue_traits::ready_queue_insertion_may_fail && - std::is_void<_always_void>::value, // just to make this dependent - // on template parameter - int> = 0) { + std::enable_if_t<std::is_void_v<_always_void> && + task_queue_traits::ready_queue_insertion_may_fail, + int> = 0) { // TODO @tasking @minor DSH this somethimes gets some things out of LIFO // order, which may be undesirable (but not a bug) @@ -223,11 +226,9 @@ struct MultipleTaskQueueTeamEntry { template <class _always_void = void> KOKKOS_INLINE_FUNCTION void flush_failed_insertions( int, int, - std::enable_if_t< - !task_queue_traits::ready_queue_insertion_may_fail && - std::is_void<_always_void>::value, // just to make this dependent - // on template parameter - int> = 0) {} + std::enable_if_t<std::is_void_v<_always_void> && + !task_queue_traits::ready_queue_insertion_may_fail, + int> = 0) {} KOKKOS_INLINE_FUNCTION void flush_all_failed_insertions() { @@ -341,8 +342,8 @@ class MultipleTaskQueue final static constexpr int NumPriorities = 3; KOKKOS_INLINE_FUNCTION - constexpr typename vla_emulation_base_t::vla_entry_count_type n_queues() const - noexcept { + constexpr typename vla_emulation_base_t::vla_entry_count_type n_queues() + const noexcept { return this->n_vla_entries(); } @@ -350,11 +351,11 @@ class MultipleTaskQueue final //---------------------------------------------------------------------------- // <editor-fold desc="Constructors, destructors, and assignment"> {{{2 - MultipleTaskQueue() = delete; - 
MultipleTaskQueue(MultipleTaskQueue const&) = delete; - MultipleTaskQueue(MultipleTaskQueue&&) = delete; + MultipleTaskQueue() = delete; + MultipleTaskQueue(MultipleTaskQueue const&) = delete; + MultipleTaskQueue(MultipleTaskQueue&&) = delete; MultipleTaskQueue& operator=(MultipleTaskQueue const&) = delete; - MultipleTaskQueue& operator=(MultipleTaskQueue&&) = delete; + MultipleTaskQueue& operator=(MultipleTaskQueue&&) = delete; MultipleTaskQueue(typename base_t::execution_space const& arg_execution_space, typename base_t::memory_space const&, @@ -424,8 +425,8 @@ class MultipleTaskQueue final // TODO @tasking @generalization DSH make this a property-based customization // point KOKKOS_INLINE_FUNCTION - team_scheduler_info_type initial_team_scheduler_info(int rank_in_league) const - noexcept { + team_scheduler_info_type initial_team_scheduler_info( + int rank_in_league) const noexcept { return team_scheduler_info_type{ typename team_scheduler_info_type::team_queue_id_t(rank_in_league % n_queues())}; @@ -494,6 +495,10 @@ class MultipleTaskQueue final } /* namespace Impl */ } /* namespace Kokkos */ +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp index bc6197753c32d3eb7e569f21c996fa26b23c2166..76b079b6b91ecdea6f9c83d8f49c4fa3ce6775ce 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp @@ -78,8 +78,6 @@ namespace Tools { const std::string InitArguments::unset_string_option = { "kokkos_tools_impl_unset_option"}; -InitArguments tool_arguments; - namespace Impl { void parse_command_line_arguments(int& argc, char* argv[], InitArguments& arguments) { @@ -971,84 +969,6 @@ void 
set_callbacks(Kokkos::Tools::Experimental::EventSet new_events) { } // namespace Experimental } // namespace Tools -namespace Profiling { -bool profileLibraryLoaded() { return Kokkos::Tools::profileLibraryLoaded(); } - -void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID) { - Kokkos::Tools::beginParallelFor(kernelPrefix, devID, kernelID); -} -void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID) { - Kokkos::Tools::beginParallelReduce(kernelPrefix, devID, kernelID); -} -void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID) { - Kokkos::Tools::beginParallelScan(kernelPrefix, devID, kernelID); -} -void endParallelFor(const uint64_t kernelID) { - Kokkos::Tools::endParallelFor(kernelID); -} -void endParallelReduce(const uint64_t kernelID) { - Kokkos::Tools::endParallelReduce(kernelID); -} -void endParallelScan(const uint64_t kernelID) { - Kokkos::Tools::endParallelScan(kernelID); -} - -void pushRegion(const std::string& kName) { Kokkos::Tools::pushRegion(kName); } -void popRegion() { Kokkos::Tools::popRegion(); } - -void createProfileSection(const std::string& sectionName, uint32_t* secID) { - Kokkos::Tools::createProfileSection(sectionName, secID); -} -void destroyProfileSection(const uint32_t secID) { - Kokkos::Tools::destroyProfileSection(secID); -} - -void startSection(const uint32_t secID) { Kokkos::Tools::startSection(secID); } - -void stopSection(const uint32_t secID) { Kokkos::Tools::stopSection(secID); } - -void markEvent(const std::string& eventName) { - Kokkos::Tools::markEvent(eventName); -} -void allocateData(const SpaceHandle handle, const std::string name, - const void* data, const uint64_t size) { - Kokkos::Tools::allocateData(handle, name, data, size); -} -void deallocateData(const SpaceHandle space, const std::string label, - const void* ptr, const uint64_t size) { - Kokkos::Tools::deallocateData(space, label, ptr, 
size); -} - -void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, - const void* dst_ptr, const SpaceHandle src_space, - const std::string src_label, const void* src_ptr, - const uint64_t size) { - Kokkos::Tools::beginDeepCopy(dst_space, dst_label, dst_ptr, src_space, - src_label, src_ptr, size); -} -void endDeepCopy() { Kokkos::Tools::endDeepCopy(); } - -void finalize() { Kokkos::Tools::finalize(); } -void initialize(const std::string& profileLibrary) { - Kokkos::Tools::initialize(profileLibrary); -} - -bool printHelp(const std::string& args) { - return Kokkos::Tools::printHelp(args); -} -void parseArgs(const std::string& args) { Kokkos::Tools::parseArgs(args); } -void parseArgs(int _argc, char** _argv) { - Kokkos::Tools::parseArgs(_argc, _argv); -} - -SpaceHandle make_space_handle(const char* space_name) { - return Kokkos::Tools::make_space_handle(space_name); -} -} // namespace Profiling - // Tuning namespace Tools { diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp index 025d8d3d18e69c27dec7dd2e569bfcf530e48dc8..4b8ad94a131b289991ae99918a5583828f3844d7 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp @@ -17,6 +17,15 @@ #ifndef KOKKOS_IMPL_KOKKOS_PROFILING_HPP #define KOKKOS_IMPL_KOKKOS_PROFILING_HPP +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PROFILING +#endif + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_ExecPolicy.hpp> +#include <Kokkos_Macros.hpp> +#include <Kokkos_Tuners.hpp> #include <impl/Kokkos_Profiling_Interface.hpp> #include <memory> #include <iosfwd> @@ -64,6 +73,11 @@ void parse_command_line_arguments(int& narg, char* arg[], Kokkos::Tools::Impl::InitializationStatus parse_environment_variables( InitArguments& arguments); +template <typename PolicyType, typename Functor> +struct ToolResponse { + PolicyType 
policy; +}; + } // namespace Impl bool profileLibraryLoaded(); @@ -187,15 +201,6 @@ void profile_fence_event(const std::string& name, DirectFenceIDHandle devIDTag, Kokkos::Tools::endFence(handle); } -inline uint32_t int_for_synchronization_reason( - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason) { - switch (reason) { - case GlobalDeviceSynchronization: return 0; - case DeepCopyResourceSynchronization: return 0x00ffffff; - } - return 0; -} - template <typename Space, typename FencingFunctor> void profile_fence_event( const std::string& name, @@ -260,43 +265,46 @@ size_t get_new_context_id(); size_t get_current_context_id(); } // namespace Experimental +namespace Impl {} // namespace Impl + } // namespace Tools namespace Profiling { -bool profileLibraryLoaded(); +// don't let ClangFormat reorder the using-declarations below +// clang-format off +using Kokkos::Tools::profileLibraryLoaded; -void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID); -void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID); -void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, - uint64_t* kernelID); -void endParallelFor(const uint64_t kernelID); -void endParallelReduce(const uint64_t kernelID); -void endParallelScan(const uint64_t kernelID); -void pushRegion(const std::string& kName); -void popRegion(); +using Kokkos::Tools::printHelp; +using Kokkos::Tools::parseArgs; -void createProfileSection(const std::string& sectionName, uint32_t* secID); -void destroyProfileSection(const uint32_t secID); -void startSection(const uint32_t secID); +using Kokkos::Tools::initialize; +using Kokkos::Tools::finalize; -void stopSection(const uint32_t secID); +using Kokkos::Tools::beginParallelFor; +using Kokkos::Tools::beginParallelReduce; +using Kokkos::Tools::beginParallelScan; +using Kokkos::Tools::endParallelFor; +using Kokkos::Tools::endParallelReduce; +using 
Kokkos::Tools::endParallelScan; -void markEvent(const std::string& eventName); -void allocateData(const SpaceHandle handle, const std::string name, - const void* data, const uint64_t size); -void deallocateData(const SpaceHandle space, const std::string label, - const void* ptr, const uint64_t size); -void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, - const void* dst_ptr, const SpaceHandle src_space, - const std::string src_label, const void* src_ptr, - const uint64_t size); -void endDeepCopy(); -void finalize(); -void initialize(const std::string& = {}); +using Kokkos::Tools::allocateData; +using Kokkos::Tools::deallocateData; + +using Kokkos::Tools::beginDeepCopy; +using Kokkos::Tools::endDeepCopy; + +using Kokkos::Tools::pushRegion; +using Kokkos::Tools::popRegion; -SpaceHandle make_space_handle(const char* space_name); +using Kokkos::Tools::createProfileSection; +using Kokkos::Tools::destroyProfileSection; +using Kokkos::Tools::startSection; +using Kokkos::Tools::stopSection; + +using Kokkos::Tools::markEvent; + +using Kokkos::Tools::make_space_handle; +// clang-format on namespace Experimental { using Kokkos::Tools::Experimental::set_allocate_data_callback; @@ -374,4 +382,9 @@ size_t get_new_variable_id(); } // namespace Kokkos +#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PROFILING +#undef KOKKOS_IMPL_PUBLIC_INCLUDE +#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_PROFILING +#endif + #endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h index 731a11e917ad758759938531e52fddf7efd30533..80f63428a15579217c96e28962c7a522a999d247 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h @@ -28,10 +28,14 @@ #include <stdbool.h> #endif -#define KOKKOSP_INTERFACE_VERSION 20211015 +#define KOKKOSP_INTERFACE_VERSION 20240906 // Profiling +#ifdef __cplusplus +extern "C" { 
+#endif + struct Kokkos_Profiling_KokkosPDeviceInfo { size_t deviceID; }; @@ -154,7 +158,7 @@ enum Kokkos_Tools_OptimizationType { Kokkos_Tools_Maximize }; -struct Kokkos_Tools_OptimzationGoal { +struct Kokkos_Tools_OptimizationGoal { size_t type_id; enum Kokkos_Tools_OptimizationType goal; }; @@ -220,7 +224,7 @@ typedef void (*Kokkos_Tools_contextBeginFunction)(const size_t); typedef void (*Kokkos_Tools_contextEndFunction)( const size_t, struct Kokkos_Tools_VariableValue); typedef void (*Kokkos_Tools_optimizationGoalDeclarationFunction)( - const size_t, const struct Kokkos_Tools_OptimzationGoal goal); + const size_t, const struct Kokkos_Tools_OptimizationGoal goal); struct Kokkos_Profiling_EventSet { Kokkos_Profiling_initFunction init; @@ -267,4 +271,8 @@ struct Kokkos_Profiling_EventSet { // changing struct layout }; +#ifdef __cplusplus +} +#endif + #endif // KOKKOS_PROFILING_C_INTERFACE_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index af71932e47beb57e8e4ed312e262ae8b8dc03597..ddd6223be1c506632768ae496d61134cef2e82ff 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -101,6 +101,15 @@ inline uint32_t device_id(ExecutionSpace const& space) noexcept { << num_instance_bits) + space.impl_instance_id(); } + +inline uint32_t int_for_synchronization_reason( + Kokkos::Tools::Experimental::SpecialSynchronizationCases reason) { + switch (reason) { + case GlobalDeviceSynchronization: return 0; + case DeepCopyResourceSynchronization: return 0x00ffffff; + } + return 0; +} } // namespace Experimental } // namespace Tools } // end namespace Kokkos @@ -226,7 +235,7 @@ using ValueType = Kokkos_Tools_VariableInfo_ValueType; using CandidateValueType = Kokkos_Tools_VariableInfo_CandidateValueType; using SetOrRange = Kokkos_Tools_VariableInfo_SetOrRange; using VariableInfo = 
Kokkos_Tools_VariableInfo; -using OptimizationGoal = Kokkos_Tools_OptimzationGoal; +using OptimizationGoal = Kokkos_Tools_OptimizationGoal; using TuningString = Kokkos_Tools_Tuning_String; using VariableValue = Kokkos_Tools_VariableValue; diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp index 255f5125f4abf9d81c96eb6a5a102868a510dc15..ef8c2831848603c0972265f4aed63e42a5ad439e 100644 --- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp @@ -20,6 +20,8 @@ #include <Kokkos_Core.hpp> #include <iomanip> +#include <iostream> +#include <sstream> namespace Kokkos { namespace Impl { @@ -84,7 +86,7 @@ bool SharedAllocationRecord<void, void>::is_sane( } if (nullptr != Kokkos::atomic_exchange(&root->m_next, root_next)) { - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord failed is_sane unlocking"); } } @@ -95,12 +97,12 @@ bool SharedAllocationRecord<void, void>::is_sane( bool SharedAllocationRecord<void, void>::is_sane( SharedAllocationRecord<void, void>*) { - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord::is_sane only works with " "KOKKOS_ENABLE_DEBUG enabled"); return false; } -#endif //#ifdef KOKKOS_ENABLE_DEBUG +#endif // #ifdef KOKKOS_ENABLE_DEBUG #ifdef KOKKOS_ENABLE_DEBUG SharedAllocationRecord<void, void>* SharedAllocationRecord<void, void>::find( @@ -127,7 +129,7 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord<void, void>::find( } if (nullptr != Kokkos::atomic_exchange(&arg_root->m_next, root_next)) { - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord failed locking/unlocking"); } return r; @@ -135,10 +137,9 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord<void, void>::find( #else SharedAllocationRecord<void, void>* SharedAllocationRecord<void, void>::find( 
SharedAllocationRecord<void, void>* const, void* const) { - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord::find only works with " - "KOKKOS_ENABLE_DEBUG " - "enabled"); + "KOKKOS_ENABLE_DEBUG enabled"); return nullptr; } #endif @@ -186,13 +187,13 @@ SharedAllocationRecord<void, void>::SharedAllocationRecord( Kokkos::memory_fence(); if (nullptr != Kokkos::atomic_exchange(&m_root->m_next, this)) { - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord failed locking/unlocking"); } #endif } else { - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord given nullptr allocation"); } } @@ -202,8 +203,7 @@ void SharedAllocationRecord<void, void>::increment( const int old_count = Kokkos::atomic_fetch_add(&arg_record->m_count, 1); if (old_count < 0) { // Error - Kokkos::Impl::throw_runtime_exception( - "Kokkos::Impl::SharedAllocationRecord failed increment"); + Kokkos::abort("Kokkos::Impl::SharedAllocationRecord failed increment"); } } @@ -217,8 +217,7 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord< ss << "Kokkos allocation \""; ss << arg_record->get_label(); ss << "\" is being deallocated after Kokkos::finalize was called\n"; - auto s = ss.str(); - Kokkos::Impl::throw_runtime_exception(s); + Kokkos::abort(ss.str().c_str()); } #ifdef KOKKOS_ENABLE_DEBUG @@ -254,7 +253,7 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord< // Unlock the list: if (nullptr != Kokkos::atomic_exchange(&arg_record->m_root->m_next, root_next)) { - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord failed decrement unlocking"); } @@ -271,7 +270,7 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord< "= %d\n", arg_record->m_alloc_ptr->m_label, old_count); fflush(stderr); - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord failed decrement count"); } @@ 
-315,11 +314,24 @@ void SharedAllocationRecord<void, void>::print_host_accessible_records( void SharedAllocationRecord<void, void>::print_host_accessible_records( std::ostream&, const char* const, const SharedAllocationRecord* const, const bool) { - Kokkos::Impl::throw_runtime_exception( + Kokkos::abort( "Kokkos::Impl::SharedAllocationRecord::print_host_accessible_records" " only works with KOKKOS_ENABLE_DEBUG enabled"); } #endif +void fill_host_accessible_header_info( + SharedAllocationRecord<void, void>* arg_record, + SharedAllocationHeader& arg_header, std::string const& arg_label) { + // Fill in the Header information, directly accessible on the host + + arg_header.m_record = arg_record; + + strncpy(arg_header.m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length); + // Set last element zero, in case c_str is too long + arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; +} + } /* namespace Impl */ } /* namespace Kokkos */ diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp index 043505a158e99a8dbd5f82a598194d9ca6562433..e0c5b0d7a066de3fe8b05cc119954329ce9c4092 100644 --- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp @@ -51,6 +51,9 @@ class SharedAllocationHeader { friend class SharedAllocationRecordCommon; template <class> friend class HostInaccessibleSharedAllocationRecordCommon; + friend void fill_host_accessible_header_info( + SharedAllocationRecord<void, void>*, SharedAllocationHeader&, + std::string const&); Record* m_record; char m_label[maximum_label_length]; @@ -98,9 +101,9 @@ class SharedAllocationRecord<void, void> { int m_count; std::string m_label; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; + 
SharedAllocationRecord(SharedAllocationRecord&&) = delete; + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; /**\brief Construct and insert into 'arg_root' tracking set. @@ -145,25 +148,23 @@ class SharedAllocationRecord<void, void> { SharedAllocationRecord() : m_alloc_ptr(nullptr), m_alloc_size(0), - m_dealloc(nullptr) + m_dealloc(nullptr), #ifdef KOKKOS_ENABLE_DEBUG - , m_root(this), m_prev(this), - m_next(this) + m_next(this), #endif - , m_count(0) { } static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION const SharedAllocationHeader* head() const { return m_alloc_ptr; } /* User's memory begins at the end of the header */ - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION void* data() const { return static_cast<void*>(m_alloc_ptr + 1); } /* User's memory begins at the end of the header */ @@ -195,23 +196,64 @@ class SharedAllocationRecord<void, void> { const SharedAllocationRecord* const root, const bool detail); }; +template <class MemorySpace> +SharedAllocationHeader* checked_allocation_with_header(MemorySpace const& space, + std::string const& label, + size_t alloc_size) { + return reinterpret_cast<SharedAllocationHeader*>(space.allocate( + label.c_str(), alloc_size + sizeof(SharedAllocationHeader), alloc_size)); +} + +template <class ExecutionSpace, class MemorySpace> +SharedAllocationHeader* checked_allocation_with_header( + ExecutionSpace const& exec_space, MemorySpace const& space, + std::string const& label, size_t alloc_size) { + return reinterpret_cast<SharedAllocationHeader*>( + space.allocate(exec_space, label.c_str(), + alloc_size + sizeof(SharedAllocationHeader), alloc_size)); +} + +void fill_host_accessible_header_info(SharedAllocationHeader& arg_header, + std::string const& arg_label); + template <class 
MemorySpace> class SharedAllocationRecordCommon : public SharedAllocationRecord<void, void> { private: using derived_t = SharedAllocationRecord<MemorySpace, void>; using record_base_t = SharedAllocationRecord<void, void>; - derived_t& self() { return *static_cast<derived_t*>(this); } - derived_t const& self() const { return *static_cast<derived_t const*>(this); } protected: using record_base_t::record_base_t; - void _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label); + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif static void deallocate(record_base_t* arg_rec); public: + ~SharedAllocationRecordCommon(); + template <class ExecutionSpace> + SharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord<void, void>( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord<void, void>::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); + } + SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + static auto allocate(MemorySpace const& arg_space, std::string const& arg_label, size_t arg_alloc_size) -> derived_t*; @@ -224,34 +266,113 @@ class SharedAllocationRecordCommon : public SharedAllocationRecord<void, void> { /**\brief Deallocate tracked memory in the space */ static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size); static auto get_record(void* alloc_ptr) -> derived_t*; - std::string get_label() const; + std::string get_label() const override; static void print_records(std::ostream& s, 
MemorySpace const&, bool detail = false); }; template <class MemorySpace> class HostInaccessibleSharedAllocationRecordCommon - : public SharedAllocationRecordCommon<MemorySpace> { + : public SharedAllocationRecord<void, void> { private: - using base_t = SharedAllocationRecordCommon<MemorySpace>; using derived_t = SharedAllocationRecord<MemorySpace, void>; using record_base_t = SharedAllocationRecord<void, void>; protected: - using base_t::base_t; + using record_base_t::record_base_t; + + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif + + static void deallocate(record_base_t* arg_rec); public: + ~HostInaccessibleSharedAllocationRecordCommon(); + template <class ExecutionSpace> + HostInaccessibleSharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord<void, void>( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + Kokkos::Impl::DeepCopy<MemorySpace, HostSpace>( + exec, SharedAllocationRecord<void, void>::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + } + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + + static auto allocate(MemorySpace const& arg_space, + std::string const& arg_label, size_t arg_alloc_size) + -> derived_t*; + /**\brief Allocate tracked memory in the space */ + static void* allocate_tracked(MemorySpace const& arg_space, + std::string const& arg_alloc_label, + size_t arg_alloc_size); + /**\brief Reallocate tracked memory in the space */ + static void 
deallocate_tracked(void* arg_alloc_ptr); + /**\brief Deallocate tracked memory in the space */ + static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size); + static void print_records(std::ostream& s, MemorySpace const&, bool detail = false); static auto get_record(void* alloc_ptr) -> derived_t*; - std::string get_label() const; + std::string get_label() const override; }; -namespace { +#ifdef KOKKOS_ENABLE_DEBUG +template <class MemorySpace> +SharedAllocationRecord<void, void> + SharedAllocationRecordCommon<MemorySpace>::s_root_record; + +template <class MemorySpace> +SharedAllocationRecord<void, void> + HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::s_root_record; +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord<MEMORY_SPACE, void> \ + : public Kokkos::Impl::SharedAllocationRecordCommon<MEMORY_SPACE> { \ + using SharedAllocationRecordCommon< \ + MEMORY_SPACE>::SharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( \ + MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord<MEMORY_SPACE, void> \ + : public Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> { \ + using HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE>::HostInaccessibleSharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::SharedAllocationRecordCommon<MEMORY_SPACE> + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> /* Taking the address of this function so make sure it is unique */ template <class MemorySpace, class DestroyFunctor> -void deallocate(SharedAllocationRecord<void, void>* record_ptr) { +inline void 
deallocate(SharedAllocationRecord<void, void>* record_ptr) { using base_type = SharedAllocationRecord<MemorySpace, void>; using this_type = SharedAllocationRecord<MemorySpace, DestroyFunctor>; @@ -263,8 +384,6 @@ void deallocate(SharedAllocationRecord<void, void>* record_ptr) { delete ptr; } -} // namespace - /* * Memory space specialization of SharedAllocationRecord< Space , void > * requires : @@ -299,8 +418,8 @@ class SharedAllocationRecord &Kokkos::Impl::deallocate<MemorySpace, DestroyFunctor>), m_destroy() {} - SharedAllocationRecord() = delete; - SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord() = delete; + SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; public: @@ -349,15 +468,21 @@ union SharedAllocationTracker { // pressure on compiler optimization by reducing // number of symbols and inline functions. -#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT \ - KOKKOS_IF_ON_HOST((if (!(m_record_bits & DO_NOT_DEREF_FLAG)) { \ - Record::increment(m_record); \ - })) +#ifdef KOKKOS_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY +#define KOKKOS_IMPL_BRANCH_PROB KOKKOS_IMPL_ATTRIBUTE_UNLIKELY +#else +#define KOKKOS_IMPL_BRANCH_PROB +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT \ + KOKKOS_IF_ON_HOST( \ + (if (!(m_record_bits & DO_NOT_DEREF_FLAG)) \ + KOKKOS_IMPL_BRANCH_PROB { Record::increment(m_record); })) -#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT \ - KOKKOS_IF_ON_HOST((if (!(m_record_bits & DO_NOT_DEREF_FLAG)) { \ - Record::decrement(m_record); \ - })) +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT \ + KOKKOS_IF_ON_HOST( \ + (if (!(m_record_bits & DO_NOT_DEREF_FLAG)) \ + KOKKOS_IMPL_BRANCH_PROB { Record::decrement(m_record); })) #define KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, \ override_tracking) \ @@ -376,8 +501,8 @@ union SharedAllocationTracker { } template <class MemorySpace> 
- constexpr SharedAllocationRecord<MemorySpace, void>* get_record() const - noexcept { + constexpr SharedAllocationRecord<MemorySpace, void>* get_record() + const noexcept { return (m_record_bits & DO_NOT_DEREF_FLAG) ? nullptr : static_cast<SharedAllocationRecord<MemorySpace, void>*>( @@ -504,8 +629,41 @@ union SharedAllocationTracker { #undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT #undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT +#undef KOKKOS_IMPL_BRANCH_PROB }; +struct SharedAllocationDisableTrackingGuard { + SharedAllocationDisableTrackingGuard() { + KOKKOS_ASSERT( + (Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_enabled())); + Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_disable(); + } + + SharedAllocationDisableTrackingGuard( + const SharedAllocationDisableTrackingGuard&) = delete; + SharedAllocationDisableTrackingGuard(SharedAllocationDisableTrackingGuard&&) = + delete; + + ~SharedAllocationDisableTrackingGuard() { + KOKKOS_ASSERT(( + !Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_enabled())); + Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_enable(); + } + // clang-format off + // The old version of clang format we use is particularly egregious here + SharedAllocationDisableTrackingGuard& operator=( + const SharedAllocationDisableTrackingGuard&) = delete; + SharedAllocationDisableTrackingGuard& operator=( + SharedAllocationDisableTrackingGuard&&) = delete; + // clang-format on +}; + +template <class FunctorType, class... Args> +inline FunctorType construct_with_shared_allocation_tracking_disabled( + Args&&... 
args) { + [[maybe_unused]] auto guard = SharedAllocationDisableTrackingGuard{}; + return {std::forward<Args>(args)...}; +} } /* namespace Impl */ } /* namespace Kokkos */ #endif diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp index d403ef9db064c0d99c63ce03d3150da22de720a9..41036ab06788c0d2cd5028b3a487fe32148bfc10 100644 --- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -31,6 +31,66 @@ namespace Kokkos { namespace Impl { +template <class MemorySpace> +SharedAllocationRecordCommon<MemorySpace>::~SharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord<void, void>::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; + auto label = SharedAllocationRecord<void, void>::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} +template <class MemorySpace> +HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::~HostInaccessibleSharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord<void, void>::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; + auto label = SharedAllocationRecord<void, void>::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} + +template <class MemorySpace> +SharedAllocationRecordCommon<MemorySpace>::SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t alloc_size, + SharedAllocationRecord<void, void>::function_type dealloc) + : SharedAllocationRecord<void, void>( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord<void, 
void>::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); +} + +template <class MemorySpace> +HostInaccessibleSharedAllocationRecordCommon<MemorySpace>:: + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, + std::size_t alloc_size, + SharedAllocationRecord<void, void>::function_type dealloc) + : SharedAllocationRecord<void, void>( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + typename MemorySpace::execution_space exec; + Kokkos::Impl::DeepCopy<MemorySpace, HostSpace>( + exec, SharedAllocationRecord<void, void>::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + exec.fence(std::string("SharedAllocationRecord<Kokkos::") + + MemorySpace::name() + + "Space, void>::SharedAllocationRecord(): " + "fence after copying header from HostSpace"); +} + template <class MemorySpace> auto SharedAllocationRecordCommon<MemorySpace>::allocate( MemorySpace const& arg_space, std::string const& arg_label, @@ -76,9 +136,64 @@ void* SharedAllocationRecordCommon<MemorySpace>::reallocate_tracked( Kokkos::Impl::DeepCopy<MemorySpace, MemorySpace>( r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord<Kokkos::Experimental::HBWSpace, " - "void>::reallocate_tracked(): fence after copying data"); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); + + record_base_t::increment(r_new); + record_base_t::decrement(r_old); + + return r_new->data(); +} + +template <class MemorySpace> +auto HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::allocate( + MemorySpace const& arg_space, std::string const& arg_label, + size_t 
arg_alloc_size) -> derived_t* { + return new derived_t(arg_space, arg_label, arg_alloc_size); +} + +template <class MemorySpace> +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::allocate_tracked(const MemorySpace& arg_space, + const std::string& arg_alloc_label, + size_t arg_alloc_size) { + if (!arg_alloc_size) return nullptr; + + SharedAllocationRecord* const r = + allocate(arg_space, arg_alloc_label, arg_alloc_size); + + record_base_t::increment(r); + + return r->data(); +} + +template <class MemorySpace> +void HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::deallocate( + HostInaccessibleSharedAllocationRecordCommon::record_base_t* arg_rec) { + delete static_cast<derived_t*>(arg_rec); +} + +template <class MemorySpace> +void HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::deallocate_tracked(void* arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { + SharedAllocationRecord* const r = derived_t::get_record(arg_alloc_ptr); + record_base_t::decrement(r); + } +} + +template <class MemorySpace> +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::reallocate_tracked(void* arg_alloc_ptr, + size_t arg_alloc_size) { + derived_t* const r_old = derived_t::get_record(arg_alloc_ptr); + derived_t* const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + Kokkos::Impl::DeepCopy<MemorySpace, MemorySpace>( + r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); record_base_t::increment(r_new); record_base_t::decrement(r_old); @@ -108,20 +223,6 @@ std::string SharedAllocationRecordCommon<MemorySpace>::get_label() const { return record_base_t::m_label; } -template <class MemorySpace> -void SharedAllocationRecordCommon<MemorySpace>:: - _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label) { - // 
Fill in the Header information, directly accessible on the host - - arg_header.m_record = &self(); - - strncpy(arg_header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - template <class MemorySpace> void SharedAllocationRecordCommon<MemorySpace>::print_records( std::ostream& s, const MemorySpace&, bool detail) { diff --git a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp index 268aaa5bd6eb7bc8ee82cbde1c6e90990d60f0a3..b985efc48a3b7e1fbe92c5c765cb80aed4796da3 100644 --- a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp @@ -40,6 +40,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -458,6 +463,10 @@ inline void wait(SimpleTaskScheduler<ExecSpace, QueueType> const& scheduler) { } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------#endif ///* #if defined( KOKKOS_ENABLE_TASKDAG ) */ diff --git a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp index 12ce75ea68dba09df192dd4aae1b71f4f69a80a4..934013d67b9e001c91ccaf19528bd6c29659894c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp @@ 
-90,11 +90,11 @@ class SingleTaskQueue //---------------------------------------------------------------------------- // <editor-fold desc="Constructors, destructors, and assignment"> {{{2 - SingleTaskQueue() = delete; - SingleTaskQueue(SingleTaskQueue const&) = delete; - SingleTaskQueue(SingleTaskQueue&&) = delete; + SingleTaskQueue() = delete; + SingleTaskQueue(SingleTaskQueue const&) = delete; + SingleTaskQueue(SingleTaskQueue&&) = delete; SingleTaskQueue& operator=(SingleTaskQueue const&) = delete; - SingleTaskQueue& operator=(SingleTaskQueue&&) = delete; + SingleTaskQueue& operator=(SingleTaskQueue&&) = delete; explicit SingleTaskQueue(typename base_t::execution_space const&, typename base_t::memory_space const&, @@ -142,8 +142,8 @@ class SingleTaskQueue } KOKKOS_INLINE_FUNCTION - constexpr team_scheduler_info_type initial_team_scheduler_info(int) const - noexcept { + constexpr team_scheduler_info_type initial_team_scheduler_info( + int) const noexcept { return {}; } }; diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp deleted file mode 100644 index c57b17d646a2493cf11e94f8a396abe3ea54c256..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp +++ /dev/null @@ -1,109 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_SPINWAIT_HPP -#define KOKKOS_SPINWAIT_HPP - -#include <Kokkos_Macros.hpp> -#include <Kokkos_Atomic.hpp> - -#include <cstdint> - -#include <type_traits> - -namespace Kokkos { -namespace Impl { - -enum class WaitMode : int { - ACTIVE // Used for tight loops to keep threads active longest - , - PASSIVE // Used to quickly yield the thread to quite down the system - , - ROOT // Never sleep or yield the root thread -}; - -void host_thread_yield(const uint32_t i, const WaitMode mode); - -template <typename T> -std::enable_if_t<std::is_integral<T>::value, void> root_spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template <typename T> -std::enable_if_t<std::is_integral<T>::value, void> root_spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template <typename T> -std::enable_if_t<std::is_integral<T>::value, void> spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template <typename T> -std::enable_if_t<std::is_integral<T>::value, void> yield_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -template <typename T> -std::enable_if_t<std::is_integral<T>::value, void> spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - 
Kokkos::load_fence(); -} - -template <typename T> -std::enable_if_t<std::is_integral<T>::value, void> yield_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp index b287510b386685822302b32918b76442af76f5f2..483b91711ce87edc5249c57820b8842b1f599a4b 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp @@ -78,12 +78,12 @@ std::string demangle(const std::string& name) { class Stacktrace { public: - Stacktrace() = delete; - Stacktrace(const Stacktrace&) = delete; + Stacktrace() = delete; + Stacktrace(const Stacktrace&) = delete; Stacktrace& operator=(const Stacktrace&) = delete; Stacktrace(Stacktrace&&) = delete; - Stacktrace& operator=(Stacktrace&&) = delete; - ~Stacktrace() = delete; + Stacktrace& operator=(Stacktrace&&) = delete; + ~Stacktrace() = delete; // These are public only to avoid wasting an extra stacktrace line. // See save_stacktrace below. 
diff --git a/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp b/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp index 231cc2c39c44980bbfc2a49ee3abca5280b87c5b..bc9ba3bd572974c009590aaaff353de73a9fb884 100644 --- a/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_StringManipulation.hpp @@ -123,8 +123,8 @@ KOKKOS_INLINE_FUNCTION constexpr char *strncat(char *dest, const char *src, template <class Unsigned> KOKKOS_FUNCTION constexpr unsigned int to_chars_len(Unsigned val) { unsigned int const base = 10; - static_assert(std::is_integral<Unsigned>::value, "implementation bug"); - static_assert(std::is_unsigned<Unsigned>::value, "implementation bug"); + static_assert(std::is_integral_v<Unsigned>, "implementation bug"); + static_assert(std::is_unsigned_v<Unsigned>, "implementation bug"); unsigned int n = 1; while (val >= base) { val /= base; @@ -136,8 +136,8 @@ template <class Unsigned> KOKKOS_FUNCTION constexpr void to_chars_impl(char *first, unsigned int len, Unsigned val) { unsigned int const base = 10; - static_assert(std::is_integral<Unsigned>::value, "implementation bug"); - static_assert(std::is_unsigned<Unsigned>::value, "implementation bug"); + static_assert(std::is_integral_v<Unsigned>, "implementation bug"); + static_assert(std::is_unsigned_v<Unsigned>, "implementation bug"); unsigned int pos = len - 1; while (val > 0) { auto const num = val % base; @@ -167,7 +167,7 @@ KOKKOS_FUNCTION constexpr to_chars_result to_chars_i(char *first, char *last, if (value == 0) { *first = '0'; return {first + 1, {}}; - } else if constexpr (std::is_signed<Integral>::value) { + } else if constexpr (std::is_signed_v<Integral>) { if (value < 0) { *first++ = '-'; unsigned_val = Unsigned(~value) + Unsigned(1); diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp index 
ed548e99a89f9764033bc9dff565c56f86b6f464..14569086ec2650243c31383c0706b98255762333 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp @@ -20,6 +20,11 @@ #define KOKKOS_IMPL_TASKBASE_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_TaskScheduler_fwd.hpp> @@ -33,6 +38,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -133,9 +143,9 @@ class TaskBase { int16_t m_task_type; ///< Type of task int16_t m_priority; ///< Priority of runnable task - TaskBase(TaskBase&&) = delete; - TaskBase(const TaskBase&) = delete; - TaskBase& operator=(TaskBase&&) = delete; + TaskBase(TaskBase&&) = delete; + TaskBase(const TaskBase&) = delete; + TaskBase& operator=(TaskBase&&) = delete; TaskBase& operator=(const TaskBase&) = delete; KOKKOS_DEFAULTED_FUNCTION ~TaskBase() = default; @@ -246,10 +256,10 @@ namespace Impl { template <class Scheduler, typename ResultType, class FunctorType> class Task : public TaskBase, public FunctorType { public: - Task() = delete; - Task(Task&&) = delete; - Task(const Task&) = delete; - Task& operator=(Task&&) = delete; + Task() = delete; + Task(Task&&) = delete; + Task(const Task&) = delete; + Task& operator=(Task&&) = delete; Task& operator=(const Task&) = delete; using root_type = TaskBase; @@ -313,6 +323,10 @@ class Task : public TaskBase, public FunctorType { } /* namespace Impl */ } /* namespace Kokkos */ +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + 
//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp index a81f298bbf260f62997af5319c816816750367ec..919533e8022d640bdfbf85a37f0b6d20f7ee0d39 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp @@ -20,6 +20,11 @@ #define KOKKOS_IMPL_TASKNODE_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_TaskScheduler_fwd.hpp> @@ -39,6 +44,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -174,11 +184,11 @@ class TaskNode m_priority(static_cast<priority_type>(priority)), m_is_respawning(false) {} - TaskNode() = delete; - TaskNode(TaskNode const&) = delete; - TaskNode(TaskNode&&) = delete; + TaskNode() = delete; + TaskNode(TaskNode const&) = delete; + TaskNode(TaskNode&&) = delete; TaskNode& operator=(TaskNode const&) = delete; - TaskNode& operator=(TaskNode&&) = delete; + TaskNode& operator=(TaskNode&&) = delete; KOKKOS_INLINE_FUNCTION bool is_aggregate() const noexcept { @@ -652,6 +662,10 @@ class alignas(16) RunnableTask } /* namespace Kokkos */ +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git 
a/packages/kokkos/core/src/impl/Kokkos_TaskPolicyData.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskPolicyData.hpp index 9e6a55b3f6726c913f26040bd4bf746bf7ced1b2..263835a35b36c70fd557fbd906e7349192cc7c4a 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskPolicyData.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskPolicyData.hpp @@ -20,6 +20,11 @@ //---------------------------------------------------------------------------- #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_Core_fwd.hpp> @@ -28,6 +33,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -127,7 +137,7 @@ struct TaskPolicyWithScheduler { KOKKOS_INLINE_FUNCTION static constexpr bool has_predecessor() noexcept { - return !std::is_same<PredecessorFuture, std::nullptr_t>::value; + return !std::is_same_v<PredecessorFuture, std::nullptr_t>; } KOKKOS_INLINE_FUNCTION @@ -137,6 +147,10 @@ struct TaskPolicyWithScheduler { } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp index 8312fbc1036737938313fd457c2711421b48d66e..06f2fac4542fab731480e8a3f192653ca3592328 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp @@ -20,6 +20,11 @@ #define 
KOKKOS_IMPL_TASKQUEUE_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_TaskScheduler_fwd.hpp> @@ -40,6 +45,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -87,10 +97,10 @@ class TaskQueue : public TaskQueueBase { //---------------------------------------- ~TaskQueue(); - TaskQueue() = delete; - TaskQueue(TaskQueue&&) = delete; - TaskQueue(TaskQueue const&) = delete; - TaskQueue& operator=(TaskQueue&&) = delete; + TaskQueue() = delete; + TaskQueue(TaskQueue&&) = delete; + TaskQueue(TaskQueue const&) = delete; + TaskQueue& operator=(TaskQueue&&) = delete; TaskQueue& operator=(TaskQueue const&) = delete; TaskQueue(const memory_pool& arg_memory_pool); @@ -206,6 +216,10 @@ class TaskQueue : public TaskQueueBase { } /* namespace Impl */ } /* namespace Kokkos */ +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp index 3709d6e7209e30666404acdb115577067bd7ab08..3440d4b06b2cf4af2d6092f302ef8a686eced4d3 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp @@ -18,6 +18,11 @@ #define KOKKOS_IMPL_TASKQUEUECOMMON_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" 
+#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_TaskScheduler_fwd.hpp> @@ -455,10 +460,9 @@ class TaskQueueCommonMixin { // && Same<MemoryPool, typename Derived::memory_pool> { static_assert( - std::is_same<ExecutionSpace, - typename Derived::execution_space>::value && - std::is_same<MemorySpace, typename Derived::memory_space>::value && - std::is_same<MemoryPool, typename Derived::memory_pool>::value, + std::is_same_v<ExecutionSpace, typename Derived::execution_space> && + std::is_same_v<MemorySpace, typename Derived::memory_space> && + std::is_same_v<MemoryPool, typename Derived::memory_pool>, "Type mismatch in task_queue_allocation_size customization point"); return sizeof(Derived); diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp index e2bb9d2b61a10943e00e4c29e08b2c849cbb44c6..e5886c949357850398ee525f6aab65073eac2d44 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp @@ -18,6 +18,11 @@ #define KOKKOS_IMPL_TASKQUEUEMEMORYMANAGER_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_TaskScheduler_fwd.hpp> diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp index 4ed057a689727e9df8d16291d1b54f78237e1a6e..54ad4e775571a0279f4a7c8689ef19f39bc25ea4 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp @@ -20,6 +20,11 @@ #define KOKKOS_IMPL_TASKQUEUEMULTIPLE_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_TaskScheduler_fwd.hpp> @@ -40,6 +45,11 @@ 
//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -178,11 +188,11 @@ class LeagueQueueCollection { int m_size = static_cast<int>(KOKKOS_INVALID_INDEX); public: - LeagueQueueCollection() = delete; - LeagueQueueCollection(LeagueQueueCollection const&) = delete; - LeagueQueueCollection(LeagueQueueCollection&&) = delete; + LeagueQueueCollection() = delete; + LeagueQueueCollection(LeagueQueueCollection const&) = delete; + LeagueQueueCollection(LeagueQueueCollection&&) = delete; LeagueQueueCollection& operator=(LeagueQueueCollection const&) = delete; - LeagueQueueCollection& operator=(LeagueQueueCollection&&) = delete; + LeagueQueueCollection& operator=(LeagueQueueCollection&&) = delete; ~LeagueQueueCollection() { // destroy only the initialized queues that we own @@ -235,6 +245,10 @@ class LeagueQueueCollection { } /* namespace Impl */ } /* namespace Kokkos */ +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple_impl.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple_impl.hpp index 114483ca0a5b4d74367ac66211d8a3445f97b783..99e0e4d4091eb26ce2b9bba5729e9aec191ae750 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple_impl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple_impl.hpp @@ -18,6 +18,11 @@ #define KOKKOS_IMPL_TASKQUEUEMULTIPLE_IMPL_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + 
#if defined(KOKKOS_ENABLE_TASKDAG) #include <impl/Kokkos_TaskQueueMultiple.hpp> diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp index 074dc7bb983f01f7e73a97a83881dc153f45a9ea..b0a665bb3a9e4d1ed4d48d254ff1a34c11928ea0 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp @@ -17,6 +17,11 @@ #ifndef KOKKOS_IMPL_TASKQUEUE_IMPL_HPP #define KOKKOS_IMPL_TASKQUEUE_IMPL_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0 diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp index 6161b945b495f53700913945b165ccbb79ea8216..38008e34abc8707e6ffaa63cec3bac733dba190c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp @@ -20,6 +20,11 @@ #define KOKKOS_IMPL_TASKRESULT_HPP #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_TaskScheduler_fwd.hpp> diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp index 7dc34cc065a1a5ce7a41b43da7273c7b36a12827..314e1c704ed8f99da63fe17a1b22c1b3d98bab9c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp @@ -20,6 +20,11 @@ //---------------------------------------------------------------------------- #include <Kokkos_Macros.hpp> + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #if defined(KOKKOS_ENABLE_TASKDAG) #include <Kokkos_Core_fwd.hpp> @@ -52,8 +57,7 @@ class 
TaskTeamMemberAdapter : public TeamMember { // type that we're adapting template <typename... Args> KOKKOS_INLINE_FUNCTION explicit TaskTeamMemberAdapter( - std::enable_if_t<std::is_constructible<TeamMember, Args...>::value, - Scheduler> + std::enable_if_t<std::is_constructible_v<TeamMember, Args...>, Scheduler> arg_scheduler, Args&&... args) // TODO @tasking @minor DSH noexcept specification : TeamMember(std::forward<Args>(args)...), diff --git a/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp index a77e139ec3051eca1b0553a50a21f5c850030b55..67ed3cacd6aabc83e8df8ec7777460a01f04777a 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp @@ -36,6 +36,10 @@ namespace Impl { static std::map<std::string, Kokkos::Tools::Experimental::TeamSizeTuner> team_tuners; +static std::map<std::string, + Kokkos::Tools::Experimental::RangePolicyOccupancyTuner> + range_policy_tuners; + template <int Rank> using MDRangeTuningMap = std::map<std::string, Kokkos::Tools::Experimental::MDRangeTuner<Rank>>; @@ -45,13 +49,17 @@ static MDRangeTuningMap<Rank> mdrange_tuners; // For any policies without a tuning implementation, with a reducer template <class ReducerType, class ExecPolicy, class Functor, typename TagType> -void tune_policy(const size_t, const std::string&, ExecPolicy&, const Functor&, - TagType) {} +auto tune_policy(const size_t, const std::string&, const ExecPolicy& policy, + const Functor&, TagType) { + return policy; +} // For any policies without a tuning implementation, without a reducer template <class ExecPolicy, class Functor, typename TagType> -void tune_policy(const size_t, const std::string&, ExecPolicy&, const Functor&, - const TagType&) {} +auto tune_policy(const size_t, const std::string&, const ExecPolicy& policy, + const Functor&, const TagType&) { + return policy; +} /** * Tuning for parallel_fors and parallel_scans is a fairly 
simple process. @@ -81,6 +89,14 @@ struct SimpleTeamSizeCalculator { auto max = policy.team_size_max(functor, tag); return max; } + template <typename Policy, typename FunctorReducer> + int get_max_team_size(const Policy& policy, + const FunctorReducer& functor_reducer, + const Kokkos::ParallelReduceTag tag) { + auto max = policy.team_size_max(functor_reducer.get_functor(), + functor_reducer.get_reducer(), tag); + return max; + } template <typename Policy, typename Functor, typename Tag> int get_recommended_team_size(const Policy& policy, const Functor& functor, const Tag tag) { @@ -95,18 +111,14 @@ struct SimpleTeamSizeCalculator { using driver = Kokkos::Impl::ParallelFor<Functor, Policy, exec_space>; return driver::max_tile_size_product(policy, functor); } - template <typename Policy, typename Functor> + template <typename Policy, typename FunctorReducer> int get_mdrange_max_tile_size_product(const Policy& policy, - const Functor& functor, + const FunctorReducer& functor_reducer, const Kokkos::ParallelReduceTag&) { using exec_space = typename Policy::execution_space; - using analysis = Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, Functor, void>; - using driver = typename Kokkos::Impl::ParallelReduce< - Kokkos::Impl::CombinedFunctorReducer<Functor, - typename analysis::Reducer>, - Policy, exec_space>; - return driver::max_tile_size_product(policy, functor); + using driver = + Kokkos::Impl::ParallelReduce<FunctorReducer, Policy, exec_space>; + return driver::max_tile_size_product(policy, functor_reducer.get_functor()); } }; @@ -116,58 +128,43 @@ struct SimpleTeamSizeCalculator { // constructible from a reference to an // instance of their value_type so we construct // a value_type and temporary reducer here -template <typename ReducerType> struct ComplexReducerSizeCalculator { - template <typename Policy, typename Functor, typename Tag> - int get_max_team_size(const Policy& policy, const Functor& functor, - const Tag tag) 
{ - using value_type = typename ReducerType::value_type; - value_type value; - ReducerType reducer_example = ReducerType(value); - - using Analysis = Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, - value_type>; - typename Analysis::Reducer final_reducer(reducer_example); - - return policy.team_size_max(functor, final_reducer, tag); + template <typename Policy, typename FunctorReducer, typename Tag> + int get_max_team_size(const Policy& policy, + const FunctorReducer& functor_reducer, const Tag tag) { + return policy.team_size_max(functor_reducer.get_functor(), + functor_reducer.get_reducer(), tag); } - template <typename Policy, typename Functor, typename Tag> - int get_recommended_team_size(const Policy& policy, const Functor& functor, + template <typename Policy, typename FunctorReducer, typename Tag> + int get_recommended_team_size(const Policy& policy, + const FunctorReducer& functor_reducer, const Tag tag) { - using value_type = typename ReducerType::value_type; - value_type value; - ReducerType reducer_example = ReducerType(value); - - using Analysis = Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, - value_type>; - typename Analysis::Reducer final_reducer(reducer_example); - - return policy.team_size_recommended(functor, final_reducer, tag); + return policy.team_size_recommended(functor_reducer.get_functor(), + functor_reducer.get_reducer(), tag); } - template <typename Policy, typename Functor> + template <typename Policy, typename FunctorReducer> int get_mdrange_max_tile_size_product(const Policy& policy, - const Functor& functor, + const FunctorReducer& functor_reducer, const Kokkos::ParallelReduceTag&) { using exec_space = typename Policy::execution_space; - using Analysis = Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, - void>; - using driver = typename Kokkos::Impl::ParallelReduce< - 
Kokkos::Impl::CombinedFunctorReducer<Functor, - typename Analysis::Reducer>, - Policy, exec_space>; - return driver::max_tile_size_product(policy, functor); + using driver = + Kokkos::Impl::ParallelReduce<FunctorReducer, Policy, exec_space>; + return driver::max_tile_size_product(policy, functor_reducer.get_functor()); } }; +template <typename Policy> +auto default_tuned_version_of(const Policy& policy) { + return policy; +} + } // namespace Impl template <class Tuner, class Functor, class TagType, class TuningPermissionFunctor, class Map, class Policy> -void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy, - const Functor& functor, const TagType& tag, +auto generic_tune_policy(const std::string& label_in, Map& map, + const Policy& policy, const Functor& functor, + const TagType& tag, const TuningPermissionFunctor& should_tune) { if (should_tune(policy)) { std::string label = label_in; @@ -186,13 +183,15 @@ void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy, } return my_tuner; }(); - tuner_iter->second.tune(policy); + return tuner_iter->second.tune(policy); } + return Impl::default_tuned_version_of(policy); } template <class Tuner, class ReducerType, class Functor, class TagType, class TuningPermissionFunctor, class Map, class Policy> -void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy, - const Functor& functor, const TagType& tag, +auto generic_tune_policy(const std::string& label_in, Map& map, + const Policy& policy, const Functor& functor, + const TagType& tag, const TuningPermissionFunctor& should_tune) { if (should_tune(policy)) { std::string label = label_in; @@ -205,24 +204,23 @@ void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy, auto tuner_iter = [&]() { auto my_tuner = map.find(label); if (my_tuner == map.end()) { - return (map.emplace( - label, - Tuner(label, policy, functor, tag, - Impl::ComplexReducerSizeCalculator<ReducerType>{})) + return 
(map.emplace(label, Tuner(label, policy, functor, tag, + Impl::ComplexReducerSizeCalculator{})) .first); } return my_tuner; }(); - tuner_iter->second.tune(policy); + return tuner_iter->second.tune(policy); } + return Impl::default_tuned_version_of(policy); } // tune a TeamPolicy, without reducer template <class Functor, class TagType, class... Properties> -void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, - Kokkos::TeamPolicy<Properties...>& policy, +auto tune_policy(const size_t /**tuning_context*/, const std::string& label_in, + const Kokkos::TeamPolicy<Properties...>& policy, const Functor& functor, const TagType& tag) { - generic_tune_policy<Experimental::TeamSizeTuner>( + return generic_tune_policy<Experimental::TeamSizeTuner>( label_in, team_tuners, policy, functor, tag, [](const Kokkos::TeamPolicy<Properties...>& candidate_policy) { return (candidate_policy.impl_auto_team_size() || @@ -232,10 +230,10 @@ void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, // tune a TeamPolicy, with reducer template <class ReducerType, class Functor, class TagType, class... Properties> -void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, - Kokkos::TeamPolicy<Properties...>& policy, +auto tune_policy(const size_t /**tuning_context*/, const std::string& label_in, + const Kokkos::TeamPolicy<Properties...>& policy, const Functor& functor, const TagType& tag) { - generic_tune_policy<Experimental::TeamSizeTuner, ReducerType>( + return generic_tune_policy<Experimental::TeamSizeTuner, ReducerType>( label_in, team_tuners, policy, functor, tag, [](const Kokkos::TeamPolicy<Properties...>& candidate_policy) { return (candidate_policy.impl_auto_team_size() || @@ -243,14 +241,97 @@ void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, }); } +template <class Functor, class TagType, class... 
Properties> +auto tune_occupancy_controlled_policy( + const size_t /**tuning_context*/, const std::string& label_in, + const Kokkos::RangePolicy<Properties...>& policy, const Functor& functor, + const TagType& tag) { + return generic_tune_policy<Experimental::RangePolicyOccupancyTuner>( + label_in, range_policy_tuners, policy, functor, tag, + [](const Kokkos::RangePolicy<Properties...>& candidate_policy) { + return candidate_policy.impl_get_occupancy_control().should_tune(); + }); +} +template <class Functor, class TagType, class... Properties> +auto tune_range_policy(const size_t tuning_context, const std::string& label_in, + const Kokkos::RangePolicy<Properties...>& policy, + const Functor& functor, const TagType& tag, + std::true_type) { + return tune_occupancy_controlled_policy(tuning_context, label_in, policy, + functor, tag); +} +template <class Functor, class TagType, class... Properties> +auto tune_range_policy(const size_t /**tuning_context*/, + const std::string& /*label_in*/, + const Kokkos::RangePolicy<Properties...>& policy, + const Functor& /**functor*/, const TagType& /**tag*/, + std::false_type) { + return policy; +} + +// Reducer versions +template <class RT, class Functor, class TagType, class... Properties> +auto tune_occupancy_controlled_policy( + const size_t /**tuning_context*/, const std::string& label_in, + const Kokkos::RangePolicy<Properties...>& policy, const Functor& functor, + const TagType& tag) { + return generic_tune_policy<Experimental::RangePolicyOccupancyTuner>( + label_in, range_policy_tuners, policy, functor, tag, + [](const Kokkos::RangePolicy<Properties...>& candidate_policy) { + return candidate_policy.impl_get_occupancy_control().should_tune(); + }); +} +template <class RT, class Functor, class TagType, class... 
Properties> +auto tune_range_policy(const size_t tuning_context, const std::string& label_in, + const Kokkos::RangePolicy<Properties...>& policy, + const Functor& functor, const TagType& tag, + std::true_type) { + return tune_occupancy_controlled_policy<RT>(tuning_context, label_in, policy, + functor, tag); +} +template <class ReducerType, class Functor, class TagType, class... Properties> +auto tune_range_policy(const size_t /**tuning_context*/, + const std::string& /**label_in*/, + const Kokkos::RangePolicy<Properties...>& policy, + const Functor& /**functor*/, const TagType& /**tag*/, + std::false_type) { + return policy; +} + +// tune a RangePolicy, without reducer +template <class Functor, class TagType, class... Properties> +auto tune_policy(const size_t tuning_context, const std::string& label_in, + const Kokkos::RangePolicy<Properties...>& policy, + const Functor& functor, const TagType& tag) { + using policy_t = Kokkos::RangePolicy<Properties...>; + using has_desired_occupancy = + typename std::is_same<typename policy_t::occupancy_control, + Kokkos::Experimental::DesiredOccupancy>::type; + return tune_range_policy(tuning_context, label_in, policy, functor, tag, + has_desired_occupancy{}); +} + +// tune a RangePolicy, with reducer +template <class ReducerType, class Functor, class TagType, class... Properties> +auto tune_policy(const size_t tuning_context, const std::string& label_in, + const Kokkos::RangePolicy<Properties...>& policy, + const Functor& functor, const TagType& tag) { + using policy_t = Kokkos::RangePolicy<Properties...>; + using has_desired_occupancy = + typename std::is_same<typename policy_t::occupancy_control, + Kokkos::Experimental::DesiredOccupancy>::type; + return tune_range_policy<ReducerType>(tuning_context, label_in, policy, + functor, tag, has_desired_occupancy{}); +} + // tune a MDRangePolicy, without reducer template <class Functor, class TagType, class... 
Properties> -void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, - Kokkos::MDRangePolicy<Properties...>& policy, +auto tune_policy(const size_t /**tuning_context*/, const std::string& label_in, + const Kokkos::MDRangePolicy<Properties...>& policy, const Functor& functor, const TagType& tag) { using Policy = Kokkos::MDRangePolicy<Properties...>; static constexpr int rank = Policy::rank; - generic_tune_policy<Experimental::MDRangeTuner<rank>>( + return generic_tune_policy<Experimental::MDRangeTuner<rank>>( label_in, mdrange_tuners<rank>, policy, functor, tag, [](const Policy& candidate_policy) { return candidate_policy.impl_tune_tile_size(); @@ -259,12 +340,12 @@ void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, // tune a MDRangePolicy, with reducer template <class ReducerType, class Functor, class TagType, class... Properties> -void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, - Kokkos::MDRangePolicy<Properties...>& policy, +auto tune_policy(const size_t /**tuning_context*/, const std::string& label_in, + const Kokkos::MDRangePolicy<Properties...>& policy, const Functor& functor, const TagType& tag) { using Policy = Kokkos::MDRangePolicy<Properties...>; static constexpr int rank = Policy::rank; - generic_tune_policy<Experimental::MDRangeTuner<rank>, ReducerType>( + return generic_tune_policy<Experimental::MDRangeTuner<rank>, ReducerType>( label_in, mdrange_tuners<rank>, policy, functor, tag, [](const Policy& candidate_policy) { return candidate_policy.impl_tune_tile_size(); @@ -274,31 +355,35 @@ void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, template <class ReducerType> struct ReductionSwitcher { template <class Functor, class TagType, class ExecPolicy> - static void tune(const size_t tuning_context, const std::string& label, - ExecPolicy& policy, const Functor& functor, + static auto tune(const size_t tuning_context, const std::string& label, + 
const ExecPolicy& policy, const Functor& functor, const TagType& tag) { if (Kokkos::tune_internals()) { - tune_policy<ReducerType>(tuning_context, label, policy, functor, tag); + return tune_policy<ReducerType>(tuning_context, label, policy, functor, + tag); } + return Impl::default_tuned_version_of(policy); } }; template <> struct ReductionSwitcher<Kokkos::InvalidType> { template <class Functor, class TagType, class ExecPolicy> - static void tune(const size_t tuning_context, const std::string& label, - ExecPolicy& policy, const Functor& functor, + static auto tune(const size_t tuning_context, const std::string& label, + const ExecPolicy& policy, const Functor& functor, const TagType& tag) { if (Kokkos::tune_internals()) { - tune_policy(tuning_context, label, policy, functor, tag); + return tune_policy(tuning_context, label, policy, functor, tag); } + return Impl::default_tuned_version_of(policy); } }; template <class Tuner, class Functor, class TagType, class TuningPermissionFunctor, class Map, class Policy> void generic_report_results(const std::string& label_in, Map& map, - Policy& policy, const Functor&, const TagType&, + const Policy& policy, const Functor&, + const TagType&, const TuningPermissionFunctor& should_tune) { if (should_tune(policy)) { std::string label = label_in; @@ -315,14 +400,14 @@ void generic_report_results(const std::string& label_in, Map& map, // report results for a policy type we don't tune (do nothing) template <class ExecPolicy, class Functor, typename TagType> -void report_policy_results(const size_t, const std::string&, ExecPolicy&, +void report_policy_results(const size_t, const std::string&, const ExecPolicy&, const Functor&, const TagType&) {} // report results for a TeamPolicy template <class Functor, class TagType, class... 
Properties> void report_policy_results(const size_t /**tuning_context*/, const std::string& label_in, - Kokkos::TeamPolicy<Properties...>& policy, + const Kokkos::TeamPolicy<Properties...>& policy, const Functor& functor, const TagType& tag) { generic_report_results<Experimental::TeamSizeTuner>( label_in, team_tuners, policy, functor, tag, @@ -336,7 +421,7 @@ void report_policy_results(const size_t /**tuning_context*/, template <class Functor, class TagType, class... Properties> void report_policy_results(const size_t /**tuning_context*/, const std::string& label_in, - Kokkos::MDRangePolicy<Properties...>& policy, + const Kokkos::MDRangePolicy<Properties...>& policy, const Functor& functor, const TagType& tag) { using Policy = Kokkos::MDRangePolicy<Properties...>; static constexpr int rank = Policy::rank; @@ -347,6 +432,20 @@ void report_policy_results(const size_t /**tuning_context*/, }); } +// report results for a RangePolicy +template <class Functor, class TagType, class... Properties> +void report_policy_results(const size_t /**tuning_context*/, + const std::string& label_in, + const Kokkos::RangePolicy<Properties...>& policy, + const Functor& functor, const TagType& tag) { + using Policy = Kokkos::RangePolicy<Properties...>; + generic_report_results<Experimental::RangePolicyOccupancyTuner>( + label_in, range_policy_tuners, policy, functor, tag, [](const Policy&) { + return Kokkos::RangePolicy< + Properties...>::traits::experimental_contains_desired_occupancy; + }); +} + } // namespace Impl } // namespace Experimental @@ -354,8 +453,11 @@ void report_policy_results(const size_t /**tuning_context*/, namespace Impl { template <class ExecPolicy, class FunctorType> -void begin_parallel_for(ExecPolicy& policy, FunctorType& functor, +auto begin_parallel_for(const ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { + using response_type = + Kokkos::Tools::Impl::ToolResponse<ExecPolicy, FunctorType>; + response_type
response{policy}; if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> @@ -365,18 +467,19 @@ void begin_parallel_for(ExecPolicy& policy, FunctorType& functor, &kpID); } #ifdef KOKKOS_ENABLE_TUNING - size_t context_id = Kokkos::Tools::Experimental::get_new_context_id(); + size_t context_id = Kokkos::Tools::Experimental::get_current_context_id(); if (Kokkos::tune_internals()) { - Experimental::Impl::tune_policy(context_id, label, policy, functor, - Kokkos::ParallelForTag{}); + return response_type{Kokkos::Tools::Experimental::Impl::tune_policy( + context_id, label, policy, functor, Kokkos::ParallelForTag{})}; } #else (void)functor; #endif + return response; } template <class ExecPolicy, class FunctorType> -void end_parallel_for(ExecPolicy& policy, FunctorType& functor, +void end_parallel_for(const ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Tools::endParallelFor(kpID); @@ -395,8 +498,11 @@ void end_parallel_for(ExecPolicy& policy, FunctorType& functor, } template <class ExecPolicy, class FunctorType> -void begin_parallel_scan(ExecPolicy& policy, FunctorType& functor, +auto begin_parallel_scan(const ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { + using response_type = + Kokkos::Tools::Impl::ToolResponse<ExecPolicy, FunctorType>; + response_type response{policy}; if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> @@ -406,18 +512,19 @@ void begin_parallel_scan(ExecPolicy& policy, FunctorType& functor, &kpID); } #ifdef KOKKOS_ENABLE_TUNING - size_t context_id = Kokkos::Tools::Experimental::get_new_context_id(); + size_t context_id = Kokkos::Tools::Experimental::get_current_context_id(); if (Kokkos::tune_internals()) { - Experimental::Impl::tune_policy(context_id, label, policy, functor, - 
Kokkos::ParallelScanTag{}); + return response_type{Kokkos::Tools::Experimental::Impl::tune_policy( + context_id, label, policy, functor, Kokkos::ParallelScanTag{})}; } #else (void)functor; #endif + return response; } template <class ExecPolicy, class FunctorType> -void end_parallel_scan(ExecPolicy& policy, FunctorType& functor, +void end_parallel_scan(const ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Tools::endParallelScan(kpID); @@ -436,8 +543,10 @@ void end_parallel_scan(ExecPolicy& policy, FunctorType& functor, } template <class ReducerType, class ExecPolicy, class FunctorType> -void begin_parallel_reduce(ExecPolicy& policy, FunctorType& functor, +auto begin_parallel_reduce(const ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { + using response_type = ToolResponse<ExecPolicy, FunctorType>; + response_type response{policy}; if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> @@ -447,16 +556,17 @@ void begin_parallel_reduce(ExecPolicy& policy, FunctorType& functor, &kpID); } #ifdef KOKKOS_ENABLE_TUNING - size_t context_id = Kokkos::Tools::Experimental::get_new_context_id(); - Experimental::Impl::ReductionSwitcher<ReducerType>::tune( - context_id, label, policy, functor, Kokkos::ParallelReduceTag{}); + size_t context_id = Kokkos::Tools::Experimental::get_current_context_id(); + return response_type{Experimental::Impl::ReductionSwitcher<ReducerType>::tune( + context_id, label, policy, functor, Kokkos::ParallelReduceTag{})}; #else (void)functor; #endif + return response; } template <class ReducerType, class ExecPolicy, class FunctorType> -void end_parallel_reduce(ExecPolicy& policy, FunctorType& functor, +void end_parallel_reduce(const ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { 
Kokkos::Tools::endParallelReduce(kpID); @@ -474,7 +584,7 @@ void end_parallel_reduce(ExecPolicy& policy, FunctorType& functor, #endif } -} // end namespace Impl +} // namespace Impl } // namespace Tools diff --git a/packages/kokkos/core/src/impl/Kokkos_Traits.hpp b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp index 45ee60e931d882379900036669effe9d09120650..fdbd0de03f3bf44a3d08dd793782d03da5d4df47 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Traits.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp @@ -53,7 +53,7 @@ struct has_type { template <typename T, typename S, typename... Pack> struct has_type<T, S, Pack...> { private: - enum { self_value = std::is_same<T, S>::value }; + enum { self_value = std::is_same_v<T, S> }; using next = has_type<T, Pack...>; @@ -102,8 +102,7 @@ struct are_integral<T, Args...> { // Accept std::is_integral OR std::is_enum as an integral value // since a simple enum value is automically convertible to an // integral value. - (std::is_integral<T>::value || std::is_enum<T>::value) && - are_integral<Args...>::value + (std::is_integral_v<T> || std::is_enum_v<T>)&&are_integral<Args...>::value }; }; diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp index 7e2f130564fefa47d8591c69de5e1cee29e5f617..cadeed1a6d84099672074115cd9acdff651f5d94 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp @@ -49,6 +49,11 @@ struct integral_constant { template <typename... Is> struct always_true : std::true_type {}; +// type-dependent expression that is always false intended for use in +// static_assert to check "we should never get there" +template <typename... 
Deps> +struct always_false : std::false_type {}; + //============================================================================== #if defined(__cpp_lib_type_identity) diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp deleted file mode 100644 index 725ba5de092a82ab9c486c43d028c57192eaeadf..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp +++ /dev/null @@ -1,617 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP -#define KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP - -#include <Kokkos_Array.hpp> - -namespace Kokkos { -namespace Impl { - -template <class DataType, class ArrayLayout, class V, size_t N, class P> -struct ViewDataAnalysis<DataType, ArrayLayout, Kokkos::Array<V, N, P>> { - private: - using array_analysis = ViewArrayAnalysis<DataType>; - - static_assert(std::is_void<P>::value, ""); - static_assert(std::is_same<typename array_analysis::non_const_value_type, - Kokkos::Array<V, N, P>>::value, - ""); - static_assert(std::is_scalar<V>::value, - "View of Array type must be of a scalar type"); - - public: - using specialize = Kokkos::Array<>; - - using dimension = typename array_analysis::dimension; - - private: - enum { - is_const = std::is_same<typename array_analysis::value_type, - typename array_analysis::const_value_type>::value - }; - - using array_scalar_dimension = typename dimension::template 
append<N>::type; - - using scalar_type = std::conditional_t<is_const, const V, V>; - using non_const_scalar_type = V; - using const_scalar_type = const V; - - public: - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - using type = typename ViewDataType<value_type, dimension>::type; - using const_type = typename ViewDataType<const_value_type, dimension>::type; - using non_const_type = - typename ViewDataType<non_const_value_type, dimension>::type; - - using scalar_array_type = - typename ViewDataType<scalar_type, array_scalar_dimension>::type; - using const_scalar_array_type = - typename ViewDataType<const_scalar_type, array_scalar_dimension>::type; - using non_const_scalar_array_type = - typename ViewDataType<non_const_scalar_type, - array_scalar_dimension>::type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief View mapping for non-specialized data type and standard layout */ -template <class Traits> -class ViewMapping<Traits, Kokkos::Array<>> { - private: - template <class, class...> - friend class ViewMapping; - template <class, class...> - friend class Kokkos::View; - - using offset_type = ViewOffset<typename Traits::dimension, - typename Traits::array_layout, void>; - - using handle_type = typename Traits::value_type::pointer; - - handle_type m_impl_handle; - offset_type m_impl_offset; - size_t m_stride = 0; - - using scalar_type = typename Traits::value_type::value_type; - - using contiguous_reference = Kokkos::Array<scalar_type, (~std::size_t(0)), - Kokkos::Array<>::contiguous>; - using strided_reference = - Kokkos::Array<scalar_type, (~std::size_t(0)), Kokkos::Array<>::strided>; - - enum 
{ - is_contiguous_reference = - (Traits::rank == 0) || (std::is_same<typename Traits::array_layout, - Kokkos::LayoutRight>::value) - }; - - enum { Array_N = Traits::value_type::size() }; - enum { Array_S = is_contiguous_reference ? Array_N : 1 }; - - KOKKOS_INLINE_FUNCTION - ViewMapping(const handle_type &arg_handle, const offset_type &arg_offset) - : m_impl_handle(arg_handle), - m_impl_offset(arg_offset), - m_stride(is_contiguous_reference ? 0 : arg_offset.span()) {} - - public: - //---------------------------------------- - // Domain dimensions - - static constexpr unsigned Rank = Traits::dimension::rank; - - template <typename iType> - KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType &r) const { - return m_impl_offset.m_dim.extent(r); - } - - KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() - const { - return m_impl_offset.layout(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { - return m_impl_offset.dimension_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { - return m_impl_offset.dimension_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { - return m_impl_offset.dimension_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { - return m_impl_offset.dimension_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { - return m_impl_offset.dimension_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { - return m_impl_offset.dimension_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { - return m_impl_offset.dimension_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { - return m_impl_offset.dimension_7(); - } - - // Is a regular layout with uniform striding for each index. 
- using is_regular = typename offset_type::is_regular; - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_impl_offset.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_impl_offset.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_impl_offset.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_impl_offset.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_impl_offset.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_impl_offset.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_impl_offset.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_impl_offset.stride_7(); - } - - //---------------------------------------- - // Range span - - /** \brief Span of the mapped range */ - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { - return m_impl_offset.span() * Array_N; - } - - /** \brief Is the mapped range span contiguous */ - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_impl_offset.span_is_contiguous(); - } - - using reference_type = - std::conditional_t<is_contiguous_reference, contiguous_reference, - strided_reference>; - - using pointer_type = handle_type; - - /** \brief If data references are lvalue_reference than can query pointer to - * memory */ - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_impl_handle; - } - - //---------------------------------------- - // The View class performs all rank and bounds checking before - // calling these element reference methods. 
- - KOKKOS_FORCEINLINE_FUNCTION - reference_type reference() const { - return reference_type(m_impl_handle + 0, Array_N, 0); - } - - template <typename I0> - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0) const { - return reference_type(m_impl_handle + m_impl_offset(i0) * Array_S, Array_N, - m_stride); - } - - template <typename I0, typename I1> - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1) const { - return reference_type(m_impl_handle + m_impl_offset(i0, i1) * Array_S, - Array_N, m_stride); - } - - template <typename I0, typename I1, typename I2> - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1, - const I2 &i2) const { - return reference_type(m_impl_handle + m_impl_offset(i0, i1, i2) * Array_S, - Array_N, m_stride); - } - - template <typename I0, typename I1, typename I2, typename I3> - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3) * Array_S, Array_N, - m_stride); - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4> - KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0, - const I1 &i1, - const I2 &i2, - const I3 &i3, - const I4 &i4) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4) * Array_S, Array_N, - m_stride); - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5> - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5) * Array_S, - Array_N, m_stride); - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6> - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, 
const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6) * Array_S, - Array_N, m_stride); - } - - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6, typename I7> - KOKKOS_FORCEINLINE_FUNCTION reference_type - reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7) const { - return reference_type( - m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7) * Array_S, - Array_N, m_stride); - } - - //---------------------------------------- - - private: - enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ }; - enum { MemorySpanSize = sizeof(scalar_type) }; - - public: - /** \brief Span, in bytes, of the referenced memory */ - KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const { - return (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) & - ~size_t(MemorySpanMask); - } - - //---------------------------------------- - - KOKKOS_DEFAULTED_FUNCTION ViewMapping() = default; - - //---------------------------------------- - - template <class... Args> - KOKKOS_INLINE_FUNCTION ViewMapping(pointer_type ptr, Args... args) - : m_impl_handle(ptr), - m_impl_offset(std::integral_constant<unsigned, 0>(), args...), - m_stride(m_impl_offset.span()) {} - - //---------------------------------------- - - template <class... 
P> - Kokkos::Impl::SharedAllocationRecord<> *allocate_shared( - Kokkos::Impl::ViewCtorProp<P...> const &arg_prop, - typename Traits::array_layout const &arg_layout, - bool execution_space_specified) { - using alloc_prop = Kokkos::Impl::ViewCtorProp<P...>; - - using execution_space = typename alloc_prop::execution_space; - using memory_space = typename Traits::memory_space; - static_assert( - SpaceAccessibility<execution_space, memory_space>::accessible); - using functor_type = - ViewValueFunctor<typename Traits::device_type, scalar_type>; - using record_type = - Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>; - - // Query the mapping for byte-size of allocation. - using padding = std::integral_constant< - unsigned int, alloc_prop::allow_padding ? sizeof(scalar_type) : 0>; - - m_impl_offset = offset_type(padding(), arg_layout); - - const size_t alloc_size = - (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) & - ~size_t(MemorySpanMask); - const auto &alloc_name = Impl::get_property<Impl::LabelTag>(arg_prop); - const execution_space &exec_space = - Impl::get_property<Impl::ExecutionSpaceTag>(arg_prop); - const memory_space &mem_space = - Impl::get_property<Impl::MemorySpaceTag>(arg_prop); - - // Allocate memory from the memory space and create tracking record. - record_type *const record = - execution_space_specified - ? record_type::allocate(exec_space, mem_space, alloc_name, - alloc_size) - : record_type::allocate(mem_space, alloc_name, alloc_size); - - m_impl_handle = handle_type(reinterpret_cast<pointer_type>(record->data())); - - functor_type functor = - execution_space_specified - ? 
functor_type(exec_space, (pointer_type)m_impl_handle, - m_impl_offset.span() * Array_N, alloc_name) - : functor_type((pointer_type)m_impl_handle, - m_impl_offset.span() * Array_N, alloc_name); - -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) - if (false) { - // Make sure the destroy functor gets instantiated. - // This avoids "cudaErrorInvalidDeviceFunction"-type errors. - functor.destroy_shared_allocation(); - } -#endif - - // Only initialize if the allocation is non-zero. - // May be zero if one of the dimensions is zero. - if constexpr (alloc_prop::initialize) - if (alloc_size) { - // Assume destruction is only required when construction is requested. - // The ViewValueFunctor has both value construction and destruction - // operators. - record->m_destroy = std::move(functor); - - // Construct values - record->m_destroy.construct_shared_allocation(); - } - - return record; - } -}; - -/** \brief Assign Array to non-Array */ - -template <class DstTraits, class SrcTraits> -class ViewMapping< - DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same<typename DstTraits::memory_space, - typename SrcTraits::memory_space>::value && - std::is_void<typename DstTraits::specialize>::value && - (std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename DstTraits::array_layout, - Kokkos::LayoutStride>::value) && - std::is_same<typename SrcTraits::specialize, Kokkos::Array<>>::value && - (std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutStride>::value))>> { - public: - // Can only convert to View::array_type - - enum { - is_assignable_data_type = - std::is_same<typename 
DstTraits::data_type, - typename SrcTraits::scalar_array_type>::value && - (DstTraits::rank == SrcTraits::rank + 1) - }; - enum { - is_assignable = - std::is_same<typename DstTraits::data_type, - typename SrcTraits::scalar_array_type>::value && - std::is_same<typename DstTraits::array_layout, - typename SrcTraits::array_layout>::value - }; - - using TrackType = Kokkos::Impl::SharedAllocationTracker; - using DstType = ViewMapping<DstTraits, void>; - using SrcType = ViewMapping<SrcTraits, Kokkos::Array<>>; - - KOKKOS_INLINE_FUNCTION - static void assign(DstType &dst, const SrcType &src, - const TrackType & /*src_track*/) { - static_assert(is_assignable, "Can only convert to array_type"); - - using dst_offset_type = typename DstType::offset_type; - - // Array dimension becomes the last dimension. - // Arguments beyond the destination rank are ignored. - if (src.span_is_contiguous()) { // not padded - dst.m_impl_offset = dst_offset_type( - std::integral_constant<unsigned, 0>(), - typename DstTraits::array_layout( - (0 < SrcType::Rank ? src.dimension_0() - : SrcTraits::value_type::size()), - (1 < SrcType::Rank ? src.dimension_1() - : SrcTraits::value_type::size()), - (2 < SrcType::Rank ? src.dimension_2() - : SrcTraits::value_type::size()), - (3 < SrcType::Rank ? src.dimension_3() - : SrcTraits::value_type::size()), - (4 < SrcType::Rank ? src.dimension_4() - : SrcTraits::value_type::size()), - (5 < SrcType::Rank ? src.dimension_5() - : SrcTraits::value_type::size()), - (6 < SrcType::Rank ? src.dimension_6() - : SrcTraits::value_type::size()), - (7 < SrcType::Rank ? src.dimension_7() - : SrcTraits::value_type::size()))); - } else { // is padded - using padded = std::integral_constant< - unsigned int, sizeof(typename SrcTraits::value_type::value_type)>; - - dst.m_impl_offset = dst_offset_type( - padded(), typename DstTraits::array_layout( - (0 < SrcType::Rank ? src.dimension_0() - : SrcTraits::value_type::size()), - (1 < SrcType::Rank ? 
src.dimension_1() - : SrcTraits::value_type::size()), - (2 < SrcType::Rank ? src.dimension_2() - : SrcTraits::value_type::size()), - (3 < SrcType::Rank ? src.dimension_3() - : SrcTraits::value_type::size()), - (4 < SrcType::Rank ? src.dimension_4() - : SrcTraits::value_type::size()), - (5 < SrcType::Rank ? src.dimension_5() - : SrcTraits::value_type::size()), - (6 < SrcType::Rank ? src.dimension_6() - : SrcTraits::value_type::size()), - (7 < SrcType::Rank ? src.dimension_7() - : SrcTraits::value_type::size()))); - } - - dst.m_impl_handle = src.m_impl_handle; - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template <class SrcTraits, class... Args> -class ViewMapping< - std::enable_if_t<( - std::is_same<typename SrcTraits::specialize, Kokkos::Array<>>::value && - (std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutRight>::value || - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutStride>::value))>, - SrcTraits, Args...> { - private: - static_assert(SrcTraits::rank == sizeof...(Args), ""); - - enum : bool { - R0 = is_integral_extent<0, Args...>::value, - R1 = is_integral_extent<1, Args...>::value, - R2 = is_integral_extent<2, Args...>::value, - R3 = is_integral_extent<3, Args...>::value, - R4 = is_integral_extent<4, Args...>::value, - R5 = is_integral_extent<5, Args...>::value, - R6 = is_integral_extent<6, Args...>::value, - R7 = is_integral_extent<7, Args...>::value - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) - }; - - // Whether right-most rank is a range. - enum { - R0_rev = - 0 == SrcTraits::rank - ? false - : (1 == SrcTraits::rank - ? R0 - : (2 == SrcTraits::rank - ? R1 - : (3 == SrcTraits::rank - ? R2 - : (4 == SrcTraits::rank - ? 
R3 - : (5 == SrcTraits::rank - ? R4 - : (6 == SrcTraits::rank - ? R5 - : (7 == SrcTraits::rank - ? R6 - : R7))))))) - }; - - // Subview's layout - using array_layout = - std::conditional_t<((rank == 0) || - (rank <= 2 && R0 && - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutLeft>::value) || - (rank <= 2 && R0_rev && - std::is_same<typename SrcTraits::array_layout, - Kokkos::LayoutRight>::value)), - typename SrcTraits::array_layout, - Kokkos::LayoutStride>; - - using value_type = typename SrcTraits::value_type; - - using data_type = std::conditional_t< - rank == 0, value_type, - std::conditional_t< - rank == 1, value_type *, - std::conditional_t< - rank == 2, value_type **, - std::conditional_t< - rank == 3, value_type ***, - std::conditional_t< - rank == 4, value_type ****, - std::conditional_t< - rank == 5, value_type *****, - std::conditional_t< - rank == 6, value_type ******, - std::conditional_t<rank == 7, value_type *******, - value_type ********>>>>>>>>; - - public: - using traits_type = Kokkos::ViewTraits<data_type, array_layout, - typename SrcTraits::device_type, - typename SrcTraits::memory_traits>; - - using type = - Kokkos::View<data_type, array_layout, typename SrcTraits::device_type, - typename SrcTraits::memory_traits>; - - KOKKOS_INLINE_FUNCTION - static void assign(ViewMapping<traits_type, void> &dst, - ViewMapping<SrcTraits, void> const &src, Args... 
args) { - using DstType = ViewMapping<traits_type, void>; - - using dst_offset_type = typename DstType::offset_type; - using dst_handle_type = typename DstType::handle_type; - - const SubviewExtents<SrcTraits::rank, rank> extents(src.m_impl_offset.m_dim, - args...); - - dst.m_impl_offset = dst_offset_type(src.m_impl_offset, extents); - dst.m_impl_handle = dst_handle_type( - src.m_impl_handle + - src.m_impl_offset(extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), extents.domain_offset(5), - extents.domain_offset(6), extents.domain_offset(7))); - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp deleted file mode 100644 index 957717f973d1218236e91ba625781f1c12c35209..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp +++ /dev/null @@ -1,1425 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP -#define KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP - -#include <Kokkos_Layout.hpp> -#include <Kokkos_View.hpp> - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// View offset and mapping for tiled view's - -template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1> -struct is_array_layout<Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, ArgN0, ArgN1, 0, 0, 0, 0, 0, 0, true>> - : public std::true_type {}; - -template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2> -struct is_array_layout<Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, ArgN0, ArgN1, ArgN2, 0, 0, 0, 0, 0, true>> - : public std::true_type {}; - -template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2, unsigned ArgN3> -struct is_array_layout<Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, 0, 0, 0, 0, true>> - : public std::true_type {}; - -template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4> -struct is_array_layout<Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, 0, 0, 0, true>> - : public std::true_type {}; - -template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4, - unsigned ArgN5> -struct is_array_layout<Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, 0, 0, true>> - : public std::true_type {}; - -template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, 
unsigned ArgN4, - unsigned ArgN5, unsigned ArgN6> -struct is_array_layout<Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, 0, true>> - : public std::true_type {}; - -template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4, - unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> -struct is_array_layout< - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, - ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true>> - : public std::true_type {}; - -template <class L> -struct is_array_layout_tiled : public std::false_type {}; - -template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, - unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4, - unsigned ArgN5, unsigned ArgN6, unsigned ArgN7, bool IsPowerTwo> -struct is_array_layout_tiled<Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, - IsPowerTwo>> : public std::true_type { -}; // Last template parameter "true" meaning this currently only supports - // powers-of-two - -namespace Impl { - -template <class Dimension, class Layout> -struct ViewOffset< - Dimension, Layout, - std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && - is_array_layout<Layout>::value && - is_array_layout_tiled<Layout>::value)>> { - public: - static constexpr Kokkos::Iterate outer_pattern = Layout::outer_pattern; - static constexpr Kokkos::Iterate inner_pattern = Layout::inner_pattern; - - static constexpr int VORank = Dimension::rank; - - static constexpr unsigned SHIFT_0 = - Kokkos::Impl::integral_power_of_two(Layout::N0); - static constexpr unsigned SHIFT_1 = - Kokkos::Impl::integral_power_of_two(Layout::N1); - static constexpr unsigned SHIFT_2 = - Kokkos::Impl::integral_power_of_two(Layout::N2); - static constexpr unsigned SHIFT_3 = - Kokkos::Impl::integral_power_of_two(Layout::N3); - static constexpr unsigned 
SHIFT_4 = - Kokkos::Impl::integral_power_of_two(Layout::N4); - static constexpr unsigned SHIFT_5 = - Kokkos::Impl::integral_power_of_two(Layout::N5); - static constexpr unsigned SHIFT_6 = - Kokkos::Impl::integral_power_of_two(Layout::N6); - static constexpr unsigned SHIFT_7 = - Kokkos::Impl::integral_power_of_two(Layout::N7); - static constexpr int MASK_0 = Layout::N0 - 1; - static constexpr int MASK_1 = Layout::N1 - 1; - static constexpr int MASK_2 = Layout::N2 - 1; - static constexpr int MASK_3 = Layout::N3 - 1; - static constexpr int MASK_4 = Layout::N4 - 1; - static constexpr int MASK_5 = Layout::N5 - 1; - static constexpr int MASK_6 = Layout::N6 - 1; - static constexpr int MASK_7 = Layout::N7 - 1; - - static constexpr unsigned SHIFT_2T = SHIFT_0 + SHIFT_1; - static constexpr unsigned SHIFT_3T = SHIFT_0 + SHIFT_1 + SHIFT_2; - static constexpr unsigned SHIFT_4T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3; - static constexpr unsigned SHIFT_5T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4; - static constexpr unsigned SHIFT_6T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5; - static constexpr unsigned SHIFT_7T = - SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5 + SHIFT_6; - static constexpr unsigned SHIFT_8T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5 + SHIFT_6 + SHIFT_7; - - // Is an irregular layout that does not have uniform striding for each index. 
- using is_mapping_plugin = std::true_type; - using is_regular = std::false_type; - - using size_type = size_t; - using dimension_type = Dimension; - using array_layout = Layout; - - dimension_type m_dim; - size_type m_tile_N0; // Num tiles dim 0 - size_type m_tile_N1; - size_type m_tile_N2; - size_type m_tile_N3; - size_type m_tile_N4; - size_type m_tile_N5; - size_type m_tile_N6; - size_type m_tile_N7; - - //---------------------------------------- - -#define KOKKOS_IMPL_DEBUG_OUTPUT_CHECK 0 - - // Rank 2 - template <typename I0, typename I1> - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, - I1 const& i1) const { - auto tile_offset = - (outer_pattern == (Kokkos::Iterate::Left)) - ? (((i0 >> SHIFT_0) + m_tile_N0 * ((i1 >> SHIFT_1))) << SHIFT_2T) - : (((m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1))) << SHIFT_2T); - // ( num_tiles[1] * ti0 + ti1 ) * FTD - - auto local_offset = (inner_pattern == (Kokkos::Iterate::Left)) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0)) - : (((i0 & MASK_0) << SHIFT_1) + (i1 & MASK_1)); - // ( tile_dim[1] * li0 + li1 ) - -#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK - std::cout << "Am I Outer Left? " - << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "Am I Inner Left? " - << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "i0 = " << i0 << " i1 = " << i1 - << "\ntilei0 = " << (i0 >> SHIFT_0) - << " tilei1 = " << (i1 >> SHIFT_1) - << "locali0 = " << (i0 & MASK_0) - << "\nlocali1 = " << (i1 & MASK_1) << std::endl; -#endif - - return tile_offset + local_offset; - } - - // Rank 3 - template <typename I0, typename I1, typename I2> - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? 
(((i0 >> SHIFT_0) + - m_tile_N0 * ((i1 >> SHIFT_1) + m_tile_N1 * (i2 >> SHIFT_2))) - << SHIFT_3T) - : ((m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) - << SHIFT_3T); - - auto local_offset = (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1))) - : (((i0 & MASK_0) << (SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_2)) + (i2 & MASK_2)); - -#if KOKKOS_IMPL_DEBUG_OUTPUT_CHECK - std::cout << "Am I Outer Left? " - << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "Am I Inner Left? " - << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; - std::cout << "i0 = " << i0 << " i1 = " << i1 << " i2 = " << i2 - << "\ntilei0 = " << (i0 >> SHIFT_0) - << " tilei1 = " << (i1 >> SHIFT_1) - << " tilei2 = " << (i2 >> SHIFT_2) - << "\nlocali0 = " << (i0 & MASK_0) - << "locali1 = " << (i1 & MASK_1) << "locali2 = " << (i2 & MASK_2) - << std::endl; -#endif - - return tile_offset + local_offset; - } - - // Rank 4 - template <typename I0, typename I1, typename I2, typename I3> - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, - I3 const& i3) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * ((i1 >> SHIFT_1) + - m_tile_N1 * ((i2 >> SHIFT_2) + - m_tile_N2 * (i3 >> SHIFT_3)))) - << SHIFT_4T) - : ((m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) - << SHIFT_4T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2))) - : (((i0 & MASK_0) << (SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_3)) + (i3 & MASK_3)); - - return tile_offset + local_offset; - } - - // Rank 5 - template <typename I0, typename I1, typename I2, typename I3, typename I4> - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? (((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * ((i2 >> SHIFT_2) + - m_tile_N2 * ((i3 >> SHIFT_3) + - m_tile_N3 * (i4 >> SHIFT_4))))) - << SHIFT_5T) - : ((m_tile_N4 * - (m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) - << SHIFT_5T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3))) - : (((i0 & MASK_0) << (SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_4)) + (i4 & MASK_4)); - - return tile_offset + local_offset; - } - - // Rank 6 - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5> - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, - I5 const& i5) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? 
(((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * ((i4 >> SHIFT_4) + - m_tile_N4 * (i5 >> SHIFT_5)))))) - << SHIFT_6T) - : ((m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) - << SHIFT_6T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4))) - : (((i0 & MASK_0) - << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_5)) + (i5 & MASK_5)); - - return tile_offset + local_offset; - } - - // Rank 7 - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6> - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, I5 const& i5, - I6 const& i6) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? 
(((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * - ((i4 >> SHIFT_4) + - m_tile_N4 * - ((i5 >> SHIFT_5) + - m_tile_N5 * (i6 >> SHIFT_6))))))) - << SHIFT_7T) - : ((m_tile_N6 * - (m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) + - (i6 >> SHIFT_6)) - << SHIFT_7T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + - ((i6 & MASK_6) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5))) - : (((i0 & MASK_0) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + - SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) - << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_6 + SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_6 + SHIFT_5)) + - ((i5 & MASK_5) << (SHIFT_6)) + (i6 & MASK_6)); - - return tile_offset + local_offset; - } - - // Rank 8 - template <typename I0, typename I1, typename I2, typename I3, typename I4, - typename I5, typename I6, typename I7> - KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, - I2 const& i2, I3 const& i3, - I4 const& i4, I5 const& i5, - I6 const& i6, - I7 const& i7) const { - auto tile_offset = - (outer_pattern == Kokkos::Iterate::Left) - ? 
(((i0 >> SHIFT_0) + - m_tile_N0 * - ((i1 >> SHIFT_1) + - m_tile_N1 * - ((i2 >> SHIFT_2) + - m_tile_N2 * - ((i3 >> SHIFT_3) + - m_tile_N3 * - ((i4 >> SHIFT_4) + - m_tile_N4 * - ((i5 >> SHIFT_5) + - m_tile_N5 * - ((i6 >> SHIFT_6) + - m_tile_N6 * (i7 >> SHIFT_7)))))))) - << SHIFT_8T) - : ((m_tile_N7 * - (m_tile_N6 * - (m_tile_N5 * - (m_tile_N4 * - (m_tile_N3 * - (m_tile_N2 * - (m_tile_N1 * (i0 >> SHIFT_0) + - (i1 >> SHIFT_1)) + - (i2 >> SHIFT_2)) + - (i3 >> SHIFT_3)) + - (i4 >> SHIFT_4)) + - (i5 >> SHIFT_5)) + - (i6 >> SHIFT_6)) + - (i7 >> SHIFT_7)) - << SHIFT_8T); - - auto local_offset = - (inner_pattern == Kokkos::Iterate::Left) - ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + - ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + - ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + - ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + - ((i5 & MASK_5) - << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + - ((i6 & MASK_6) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5)) + - ((i7 & MASK_7) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + - SHIFT_4 + SHIFT_5 + SHIFT_6))) - : (((i0 & MASK_0) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + - SHIFT_3 + SHIFT_2 + SHIFT_1)) + - ((i1 & MASK_1) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + - SHIFT_3 + SHIFT_2)) + - ((i2 & MASK_2) - << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + - ((i3 & MASK_3) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4)) + - ((i4 & MASK_4) << (SHIFT_7 + SHIFT_6 + SHIFT_5)) + - ((i5 & MASK_5) << (SHIFT_7 + SHIFT_6)) + - ((i6 & MASK_6) << (SHIFT_7)) + (i7 & MASK_7)); - - return tile_offset + local_offset; - } - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { - return array_layout((VORank > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (VORank > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (VORank > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (VORank > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (VORank > 4 ? 
m_dim.N4 : KOKKOS_INVALID_INDEX), - (VORank > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (VORank > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (VORank > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); - } - - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { - return m_dim.N0; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { - return m_dim.N1; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { - return m_dim.N2; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { - return m_dim.N3; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { - return m_dim.N4; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { - return m_dim.N5; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { - return m_dim.N6; - } - KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { - return m_dim.N7; - } - - KOKKOS_INLINE_FUNCTION constexpr size_type size() const { - return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * - m_dim.N6 * m_dim.N7; - } - - // Strides are meaningless due to irregularity - KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return 0; } - KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 0; } - - // Stride with [ rank ] value is the total length - template <typename iType> - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - s[0] = 0; - if (0 < dimension_type::rank) { - s[1] = 0; - } - if (1 < dimension_type::rank) { - s[2] = 0; - } - if (2 < 
dimension_type::rank) { - s[3] = 0; - } - if (3 < dimension_type::rank) { - s[4] = 0; - } - if (4 < dimension_type::rank) { - s[5] = 0; - } - if (5 < dimension_type::rank) { - s[6] = 0; - } - if (6 < dimension_type::rank) { - s[7] = 0; - } - if (7 < dimension_type::rank) { - s[8] = 0; - } - } - - KOKKOS_INLINE_FUNCTION constexpr size_type span() const { - // Rank2: ( NumTile0 * ( NumTile1 ) ) * TileSize, etc - return (VORank == 2) - ? (m_tile_N0 * m_tile_N1) << SHIFT_2T - : (VORank == 3) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2) << SHIFT_3T - : (VORank == 4) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * m_tile_N3) - << SHIFT_4T - : (VORank == 5) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * - m_tile_N3 * m_tile_N4) - << SHIFT_5T - : (VORank == 6) - ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * - m_tile_N3 * m_tile_N4 * m_tile_N5) - << SHIFT_6T - : (VORank == 7) - ? (m_tile_N0 * m_tile_N1 * - m_tile_N2 * m_tile_N3 * - m_tile_N4 * m_tile_N5 * - m_tile_N6) - << SHIFT_7T - : (m_tile_N0 * m_tile_N1 * - m_tile_N2 * m_tile_N3 * - m_tile_N4 * m_tile_N5 * - m_tile_N6 * m_tile_N7) - << SHIFT_8T; - } - - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return true; - } - - //---------------------------------------- -#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { - m_dim = src.m_dim; - m_tile_N0 = src.m_tile_N0; - m_tile_N1 = src.m_tile_N1; - m_tile_N2 = src.m_tile_N2; - m_tile_N3 = src.m_tile_N3; - m_tile_N4 = src.m_tile_N4; - m_tile_N5 = src.m_tile_N5; - m_tile_N6 = src.m_tile_N6; - m_tile_N7 = src.m_tile_N7; - } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - m_tile_N0 = src.m_tile_N0; - m_tile_N1 = src.m_tile_N1; - m_tile_N2 = src.m_tile_N2; - m_tile_N3 = src.m_tile_N3; - m_tile_N4 = src.m_tile_N4; - m_tile_N5 = src.m_tile_N5; - m_tile_N6 = src.m_tile_N6; - m_tile_N7 = src.m_tile_N7; - return *this; - } -#else - KOKKOS_DEFAULTED_FUNCTION ~ViewOffset() = 
default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset() = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset(const ViewOffset&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewOffset& operator=(const ViewOffset&) = default; -#endif - - template <unsigned TrivialScalarSize> - KOKKOS_INLINE_FUNCTION constexpr ViewOffset( - std::integral_constant<unsigned, TrivialScalarSize> const&, - array_layout const arg_layout) - : m_dim(arg_layout.dimension[0], arg_layout.dimension[1], - arg_layout.dimension[2], arg_layout.dimension[3], - arg_layout.dimension[4], arg_layout.dimension[5], - arg_layout.dimension[6], arg_layout.dimension[7]), - m_tile_N0((arg_layout.dimension[0] + MASK_0) >> - SHIFT_0 /* number of tiles in first dimension */), - m_tile_N1((arg_layout.dimension[1] + MASK_1) >> SHIFT_1), - m_tile_N2((VORank > 2) ? (arg_layout.dimension[2] + MASK_2) >> SHIFT_2 - : 0), - m_tile_N3((VORank > 3) ? (arg_layout.dimension[3] + MASK_3) >> SHIFT_3 - : 0), - m_tile_N4((VORank > 4) ? (arg_layout.dimension[4] + MASK_4) >> SHIFT_4 - : 0), - m_tile_N5((VORank > 5) ? (arg_layout.dimension[5] + MASK_5) >> SHIFT_5 - : 0), - m_tile_N6((VORank > 6) ? (arg_layout.dimension[6] + MASK_6) >> SHIFT_6 - : 0), - m_tile_N7((VORank > 7) ? 
(arg_layout.dimension[7] + MASK_7) >> SHIFT_7 - : 0) {} -}; - -// FIXME Remove the out-of-class definitions when we require C++17 -#define KOKKOS_ITERATE_VIEW_OFFSET_ENABLE \ - std::enable_if_t<((Dimension::rank <= 8) && (Dimension::rank >= 2) && \ - is_array_layout<Layout>::value && \ - is_array_layout_tiled<Layout>::value)> -template <class Dimension, class Layout> -constexpr Kokkos::Iterate ViewOffset< - Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::outer_pattern; -template <class Dimension, class Layout> -constexpr Kokkos::Iterate ViewOffset< - Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::inner_pattern; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::VORank; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_0; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_1; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_2; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_3; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_4; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_5; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_6; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_7; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, 
KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_0; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_1; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_2; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_3; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_4; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_5; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_6; -template <class Dimension, class Layout> -constexpr int - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_7; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_2T; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_3T; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_4T; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_5T; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_6T; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_7T; -template <class Dimension, class Layout> -constexpr unsigned - ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_8T; -#undef KOKKOS_ITERATE_VIEW_OFFSET_ENABLE - 
-//---------------------------------------- - -// ViewMapping assign method needed in order to return a 'subview' tile as a -// proper View The outer iteration pattern determines the mapping of the pointer -// offset to the beginning of requested tile The inner iteration pattern is -// needed for the layout of the tile's View to be returned Rank 2 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0, - typename iType1> -class ViewMapping<std::enable_if_t<(N2 == 0 && N3 == 0 && N4 == 0 && N5 == 0 && - N6 == 0 && N7 == 0)> // void - , - Kokkos::ViewTraits< - T**, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, - N3, N4, N5, N6, N7, true>, - iType0, iType1> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - using src_traits = Kokkos::ViewTraits<T**, src_layout, P...>; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using traits = Kokkos::ViewTraits<T[N0][N1], array_layout, P...>; - using type = Kokkos::View<T[N0][N1], array_layout, P...>; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1) { - using dst_map_type = ViewMapping<traits, void>; - using src_map_type = ViewMapping<src_traits, void>; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = 
dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left ? ((i_tile0 + src.m_impl_offset.m_tile_N0 * i_tile1) - << src_offset_type::SHIFT_2T) - : ((src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) - << src_offset_type::SHIFT_2T)) // offset to start - // of the tile - ), - dst_offset_type()); - } -}; - -// Rank 3 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0, - typename iType1, typename iType2> -class ViewMapping<std::enable_if_t<(N3 == 0 && N4 == 0 && N5 == 0 && N6 == 0 && - N7 == 0)> // void - , - Kokkos::ViewTraits< - T***, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, - N3, N4, N5, N6, N7, true>, - iType0, iType1, iType2> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - using src_traits = Kokkos::ViewTraits<T***, src_layout, P...>; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using traits = Kokkos::ViewTraits<T[N0][N1][N2], array_layout, P...>; - using type = Kokkos::View<T[N0][N1][N2], array_layout, P...>; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2) { - using dst_map_type = ViewMapping<traits, void>; - using src_map_type = ViewMapping<src_traits, void>; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename 
src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + src.m_impl_offset.m_tile_N1 * i_tile2)) - << src_offset_type::SHIFT_3T) - : ((src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) + - i_tile2) - << src_offset_type::SHIFT_3T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 4 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0, - typename iType1, typename iType2, typename iType3> -class ViewMapping< - std::enable_if_t<(N4 == 0 && N5 == 0 && N6 == 0 && N7 == 0)> // void - , - Kokkos::ViewTraits< - T****, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, - N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>, - iType0, iType1, iType2, iType3> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - using src_traits = Kokkos::ViewTraits<T****, src_layout, P...>; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using traits = Kokkos::ViewTraits<T[N0][N1][N2][N3], array_layout, P...>; - using type = Kokkos::View<T[N0][N1][N2][N3], array_layout, P...>; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3) { - using dst_map_type = ViewMapping<traits, void>; - using src_map_type = ViewMapping<src_traits, 
void>; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + src.m_impl_offset.m_tile_N1 * - (i_tile2 + src.m_impl_offset.m_tile_N2 * - i_tile3))) - << src_offset_type::SHIFT_4T) - : ((src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) - << src_offset_type::SHIFT_4T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 5 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0, - typename iType1, typename iType2, typename iType3, typename iType4> -class ViewMapping<std::enable_if_t<(N5 == 0 && N6 == 0 && N7 == 0)> // void - , - Kokkos::ViewTraits< - T*****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, - N3, N4, N5, N6, N7, true>, - iType0, iType1, iType2, iType3, iType4> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - using src_traits = Kokkos::ViewTraits<T*****, src_layout, P...>; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using traits = Kokkos::ViewTraits<T[N0][N1][N2][N3][N4], array_layout, P...>; - using type = Kokkos::View<T[N0][N1][N2][N3][N4], array_layout, P...>; - - KOKKOS_INLINE_FUNCTION static void assign( - 
ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4) { - using dst_map_type = ViewMapping<traits, void>; - using src_map_type = ViewMapping<src_traits, void>; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * i_tile4)))) - << src_offset_type::SHIFT_5T) - : ((src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) - << src_offset_type::SHIFT_5T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 6 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... 
P, typename iType0, - typename iType1, typename iType2, typename iType3, typename iType4, - typename iType5> -class ViewMapping<std::enable_if_t<(N6 == 0 && N7 == 0)> // void - , - Kokkos::ViewTraits< - T******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, - N3, N4, N5, N6, N7, true>, - iType0, iType1, iType2, iType3, iType4, iType5> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - using src_traits = Kokkos::ViewTraits<T******, src_layout, P...>; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using traits = - Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5], array_layout, P...>; - using type = Kokkos::View<T[N0][N1][N2][N3][N4][N5], array_layout, P...>; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5) { - using dst_map_type = ViewMapping<traits, void>; - using src_map_type = ViewMapping<src_traits, void>; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + src.m_impl_offset.m_tile_N4 * - i_tile5))))) - << src_offset_type::SHIFT_6T) - : ((src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) - << src_offset_type::SHIFT_6T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 7 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0, - typename iType1, typename iType2, typename iType3, typename iType4, - typename iType5, typename iType6> -class ViewMapping<std::enable_if_t<(N7 == 0)> // void - , - Kokkos::ViewTraits< - T*******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, - N3, N4, N5, N6, N7, true>, - iType0, iType1, iType2, iType3, iType4, iType5, iType6> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - using src_traits = Kokkos::ViewTraits<T*******, src_layout, P...>; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using traits = - Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5][N6], array_layout, P...>; - using type = Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6], array_layout, P...>; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping<traits, void>& dst, const 
ViewMapping<src_traits, void>& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5, const iType6 i_tile6) { - using dst_map_type = ViewMapping<traits, void>; - using src_map_type = ViewMapping<src_traits, void>; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? ((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + - src.m_impl_offset.m_tile_N4 * - (i_tile5 + - src.m_impl_offset.m_tile_N5 * - i_tile6)))))) - << src_offset_type::SHIFT_7T) - : ((src.m_impl_offset.m_tile_N6 * - (src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * - i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) + - i_tile6) - << src_offset_type::SHIFT_7T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -// Rank 8 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... 
P, typename iType0, - typename iType1, typename iType2, typename iType3, typename iType4, - typename iType5, typename iType6, typename iType7> -class ViewMapping< - std::enable_if_t<(N0 != 0 && N1 != 0 && N2 != 0 && N3 != 0 && N4 != 0 && - N5 != 0 && N6 != 0 && N7 != 0)> // void - , - Kokkos::ViewTraits< - T********, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, - N5, N6, N7, true>, - P...>, - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>, - iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7> { - public: - using src_layout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - using src_traits = Kokkos::ViewTraits<T********, src_layout, P...>; - - static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); - static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); - using array_layout = std::conditional_t<is_inner_left, Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using traits = - Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5][N6][N7], array_layout, P...>; - using type = - Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6][N7], array_layout, P...>; - - KOKKOS_INLINE_FUNCTION static void assign( - ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, - const src_layout&, const iType0 i_tile0, const iType1 i_tile1, - const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, - const iType5 i_tile5, const iType6 i_tile6, const iType7 i_tile7) { - using dst_map_type = ViewMapping<traits, void>; - using src_map_type = ViewMapping<src_traits, void>; - using dst_handle_type = typename dst_map_type::handle_type; - using dst_offset_type = typename dst_map_type::offset_type; - using src_offset_type = typename src_map_type::offset_type; - - dst = dst_map_type( - dst_handle_type( - src.m_impl_handle + - (is_outer_left - ? 
((i_tile0 + - src.m_impl_offset.m_tile_N0 * - (i_tile1 + - src.m_impl_offset.m_tile_N1 * - (i_tile2 + - src.m_impl_offset.m_tile_N2 * - (i_tile3 + - src.m_impl_offset.m_tile_N3 * - (i_tile4 + - src.m_impl_offset.m_tile_N4 * - (i_tile5 + - src.m_impl_offset.m_tile_N5 * - (i_tile6 + - src.m_impl_offset.m_tile_N6 * - i_tile7))))))) - << src_offset_type::SHIFT_8T) - : ((src.m_impl_offset.m_tile_N7 * - (src.m_impl_offset.m_tile_N6 * - (src.m_impl_offset.m_tile_N5 * - (src.m_impl_offset.m_tile_N4 * - (src.m_impl_offset.m_tile_N3 * - (src.m_impl_offset.m_tile_N2 * - (src.m_impl_offset.m_tile_N1 * - i_tile0 + - i_tile1) + - i_tile2) + - i_tile3) + - i_tile4) + - i_tile5) + - i_tile6) + - i_tile7) - << src_offset_type::SHIFT_8T))) // offset to start of the - // tile - , - dst_offset_type()); - } -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------- - -namespace Kokkos { - -// Rank 2 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P> -KOKKOS_INLINE_FUNCTION - Kokkos::View<T[N0][N1], - std::conditional_t<(InnerP == Kokkos::Iterate::Left), - Kokkos::LayoutLeft, Kokkos::LayoutRight>, - P...> - tile_subview(const Kokkos::View< - T**, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - - return Kokkos::View<T[N0][N1], array_layout, P...>(src, SrcLayout(), i_tile0, - i_tile1); -} - -// Rank 3 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P> -KOKKOS_INLINE_FUNCTION - Kokkos::View<T[N0][N1][N2], - std::conditional_t<(InnerP == Kokkos::Iterate::Left), - Kokkos::LayoutLeft, Kokkos::LayoutRight>, - P...> - tile_subview(const Kokkos::View< - T***, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - - return Kokkos::View<T[N0][N1][N2], array_layout, P...>( - src, SrcLayout(), i_tile0, i_tile1, i_tile2); -} - -// Rank 4 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... 
P> -KOKKOS_INLINE_FUNCTION - Kokkos::View<T[N0][N1][N2][N3], - std::conditional_t<(InnerP == Kokkos::Iterate::Left), - Kokkos::LayoutLeft, Kokkos::LayoutRight>, - P...> - tile_subview(const Kokkos::View< - T****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - - return Kokkos::View<T[N0][N1][N2][N3], array_layout, P...>( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3); -} - -// Rank 5 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P> -KOKKOS_INLINE_FUNCTION - Kokkos::View<T[N0][N1][N2][N3][N4], - std::conditional_t<(InnerP == Kokkos::Iterate::Left), - Kokkos::LayoutLeft, Kokkos::LayoutRight>, - P...> - tile_subview(const Kokkos::View< - T*****, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - - return Kokkos::View<T[N0][N1][N2][N3][N4], array_layout, P...>( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4); -} - -// Rank 6 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P> -KOKKOS_INLINE_FUNCTION - Kokkos::View<T[N0][N1][N2][N3][N4][N5], - std::conditional_t<(InnerP == Kokkos::Iterate::Left), - Kokkos::LayoutLeft, Kokkos::LayoutRight>, - P...> - tile_subview(const Kokkos::View< - T******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - - return Kokkos::View<T[N0][N1][N2][N3][N4][N5], array_layout, P...>( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5); -} - -// Rank 7 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... 
P> -KOKKOS_INLINE_FUNCTION - Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6], - std::conditional_t<(InnerP == Kokkos::Iterate::Left), - Kokkos::LayoutLeft, Kokkos::LayoutRight>, - P...> - tile_subview(const Kokkos::View< - T*******, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5, - const size_t i_tile6) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. - using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - - return Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6], array_layout, P...>( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, - i_tile6); -} - -// Rank 8 -template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, - unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, - unsigned N5, unsigned N6, unsigned N7, class... P> -KOKKOS_INLINE_FUNCTION - Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6][N7], - std::conditional_t<(InnerP == Kokkos::Iterate::Left), - Kokkos::LayoutLeft, Kokkos::LayoutRight>, - P...> - tile_subview(const Kokkos::View< - T********, - Kokkos::Experimental::LayoutTiled< - OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, - P...>& src, - const size_t i_tile0, const size_t i_tile1, - const size_t i_tile2, const size_t i_tile3, - const size_t i_tile4, const size_t i_tile5, - const size_t i_tile6, const size_t i_tile7) { - // Force the specialized ViewMapping for extracting a tile - // by using the first subview argument as the layout. 
- using array_layout = - std::conditional_t<(InnerP == Kokkos::Iterate::Left), Kokkos::LayoutLeft, - Kokkos::LayoutRight>; - using SrcLayout = - Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, - N6, N7, true>; - - return Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6][N7], array_layout, P...>( - src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, - i_tile6, i_tile7); -} - -} /* namespace Kokkos */ -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_EXPERIENTAL_VIEWLAYOUTTILE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ZeroMemset_fwd.hpp b/packages/kokkos/core/src/impl/Kokkos_ZeroMemset_fwd.hpp index f36e72e91451888139d48014d833b66134a0eeb6..a2090ac0dbc99358283cf560daa82de71c28db4a 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ZeroMemset_fwd.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ZeroMemset_fwd.hpp @@ -20,7 +20,7 @@ namespace Kokkos { namespace Impl { -template <typename ExecutionSpace, class ViewType> +template <typename ExecutionSpace> struct ZeroMemset; } // namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp b/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp index 230322177e8adc81f442d11021557637d1dafd0c..4f280a80576d79469a65af657ee0af8b309dea84 100644 --- a/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp @@ -234,7 +234,7 @@ namespace { inline void print_bitmap(std::ostream& s, const hwloc_const_bitmap_t bitmap) { s << "{"; - for (int i = hwloc_bitmap_first(bitmap); - 1 != i; + for (int i = hwloc_bitmap_first(bitmap); -1 != i; i = hwloc_bitmap_next(bitmap, i)) { s << " " << i; } diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp index 1130485e841d9ba37e28044ac37df35d9d108afc..a29981da48f6990357007d83213949c4e652becd 100644 
--- a/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp +++ b/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -56,6 +56,8 @@ #define KOKKOS_LAMBDA [=] __host__ __device__ #define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ +#define KOKKOS_DEDUCTION_GUIDE __host__ __device__ + #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_IMPL_FORCEINLINE __forceinline__ #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline @@ -63,4 +65,12 @@ #define KOKKOS_IMPL_HOST_FUNCTION __host__ #define KOKKOS_IMPL_DEVICE_FUNCTION __device__ +// clang-format off +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION __device__ __host__ +#else +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION requires Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON" +#endif +// clang-format on + #endif /* KOKKOS_CUDA_SETUP_HPP_ */ diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp index 7b0186610707a1b561fdee79d100b9ba7d5d682e..6c333e589df6a334386d448ae3d5721d79220705 100644 --- a/packages/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp +++ b/packages/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp @@ -27,12 +27,31 @@ #define KOKKOS_LAMBDA [=] __host__ __device__ #define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ +#define KOKKOS_DEDUCTION_GUIDE __host__ __device__ + #define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ #define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline #define KOKKOS_IMPL_FUNCTION __device__ __host__ #define KOKKOS_IMPL_HOST_FUNCTION __host__ #define KOKKOS_IMPL_DEVICE_FUNCTION __device__ +// clang-format off +#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION __device__ __host__ +#else +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION requires Kokkos_ENABLE_HIP_RELOCATABLE_DEVICE_CODE=ON" +#endif +// 
clang-format on + +// The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it. +#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) +#define KOKKOS_IMPL_HIP_NATIVE_GRAPH +#endif + +#ifdef KOKKOS_ARCH_AMD_GFX942_APU +#define KOKKOS_IMPL_HIP_UNIFIED_MEMORY +#endif + #endif // #if defined( KOKKOS_ENABLE_HIP ) #endif diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp index 7f7957bc61f21a337179273ed9f882f93a969d63..e203747fb60b36519e7eba35a845041f1ba664cf 100644 --- a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -38,12 +38,36 @@ #include <CL/sycl.hpp> #endif -#ifdef __SYCL_DEVICE_ONLY__ -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...) \ - do { \ - const __attribute__((opencl_constant)) char fmt[] = (format); \ - sycl::ext::oneapi::experimental::printf(fmt, ##__VA_ARGS__); \ - } while (0) +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20230200 +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) \ + accessor.get_multi_ptr<sycl::access::decorated::yes>() +#else +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() +#endif + +// FIXME_SYCL Use type directly once it has stabilized in SYCL. +namespace Kokkos::Impl { +#ifndef SYCL_EXT_INTEL_USM_ADDRESS_SPACES +#error SYCL_EXT_INTEL_USM_ADDRESS_SPACES undefined! 
+#elif SYCL_EXT_INTEL_USM_ADDRESS_SPACES >= 2 +template <typename T> +using sycl_device_ptr = sycl::ext::intel::device_ptr<T>; +template <typename T> +using sycl_host_ptr = sycl::ext::intel::host_ptr<T>; +#else +template <typename T> +using sycl_device_ptr = sycl::device_ptr<T>; +template <typename T> +using sycl_host_ptr = sycl::host_ptr<T>; +#endif +} // namespace Kokkos::Impl + +// clang-format off +#ifdef KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION SYCL_EXTERNAL +#else +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION requires Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=ON" #endif +// clang-format on #endif diff --git a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp index 91820fbccacfdf36f5e2aea36f056a915681d6a3..037cc6c77b71cd351fb009c5e7c6f8ef36dfe19d 100644 --- a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp @@ -83,7 +83,7 @@ struct IndexTypePolicyMixin : AnalyzeNextTrait { "Kokkos Error: More than one index type given. 
Search " "compiler output for 'show_extra_index_type' to see the " "type of the errant tag."); - static_assert(std::is_integral<IntegralIndexType>::value, ""); + static_assert(std::is_integral_v<IntegralIndexType>); static constexpr bool index_type_is_defaulted = false; using index_type = Kokkos::IndexType<IntegralIndexType>; }; @@ -101,8 +101,8 @@ struct PolicyTraitMatcher<IndexTypeTrait, IndexType<IntegralIndexType>> template <class IntegralIndexType> struct PolicyTraitMatcher< IndexTypeTrait, IntegralIndexType, - std::enable_if_t<std::is_integral<IntegralIndexType>::value>> - : std::true_type {}; + std::enable_if_t<std::is_integral_v<IntegralIndexType>>> : std::true_type { +}; // </editor-fold> end PolicyTraitMatcher specialization"> }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp index 2949d969ee01b2aefeca8ad14aebaf162ba9ae8b..566d81a869771c2d73c85714ac74860bd79d44ff 100644 --- a/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp @@ -47,7 +47,7 @@ struct IterationPatternTrait : TraitSpecificationBase<IterationPatternTrait> { show_extra_iteration_pattern_erroneously_given_to_execution_policy< typename base_t::iteration_pattern>{}; static_assert( - std::is_void<typename base_t::iteration_pattern>::value, + std::is_void_v<typename base_t::iteration_pattern>, "Kokkos Error: More than one index type given. 
Search " "compiler output for 'show_extra_iteration_pattern' to see the " "type of the errant tag."); diff --git a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp index dadf582c3728fb0b241514b4af21ffd9b0ea1c1f..f7d2673bce79bb546c08d5ffb38645fb14abb3b7 100644 --- a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp @@ -34,11 +34,14 @@ struct MaximizeOccupancy; struct DesiredOccupancy { int m_occ = 100; - explicit constexpr DesiredOccupancy(int occ) : m_occ(occ) { + bool tune; + explicit constexpr DesiredOccupancy(int occ) : m_occ(occ), tune(false) { KOKKOS_EXPECTS(0 <= occ && occ <= 100); } + explicit constexpr DesiredOccupancy(const Kokkos::AUTO_t) : tune(true) {} explicit constexpr operator int() const { return m_occ; } constexpr int value() const { return m_occ; } + constexpr bool should_tune() const { return tune; } DesiredOccupancy() = default; explicit DesiredOccupancy(MaximizeOccupancy const&) : DesiredOccupancy() {} }; @@ -75,8 +78,8 @@ struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> { OccupancyControlPolicyMixin<OccControl, AnalyzeNextTrait>; template <class T> using trait_matches_specification = std::bool_constant< - std::is_same<T, Kokkos::Experimental::DesiredOccupancy>::value || - std::is_same<T, Kokkos::Experimental::MaximizeOccupancy>::value>; + std::is_same_v<T, Kokkos::Experimental::DesiredOccupancy> || + std::is_same_v<T, Kokkos::Experimental::MaximizeOccupancy>>; }; // </editor-fold> end Occupancy control trait specification }}}1 @@ -163,7 +166,7 @@ auto prefer(Policy const& p, DesiredOccupancy occ) { template <typename Policy> constexpr auto prefer(Policy const& p, MaximizeOccupancy) { - static_assert(Kokkos::is_execution_policy<Policy>::value, ""); + static_assert(Kokkos::is_execution_policy<Policy>::value); using new_policy_t = 
Kokkos::Impl::OccupancyControlTrait::policy_with_trait<Policy, MaximizeOccupancy>; diff --git a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp index 578e9e762adb7c0f2eef9fdd28f0aa0c80edb5d1..98ad1d7ebbbac73e2c2502d052ea56291236e5df 100644 --- a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp @@ -68,7 +68,7 @@ struct PolicyTraitAdaptorImpl< TraitSpec, PolicyTemplate, type_list<ProcessedTraits...>, type_list<MatchingTrait, ToProcessTraits...>, NewTrait, std::enable_if_t<PolicyTraitMatcher<TraitSpec, MatchingTrait>::value>> { - static_assert(PolicyTraitMatcher<TraitSpec, NewTrait>::value, ""); + static_assert(PolicyTraitMatcher<TraitSpec, NewTrait>::value); using type = PolicyTemplate<ProcessedTraits..., NewTrait, ToProcessTraits...>; }; @@ -92,7 +92,7 @@ template <class TraitSpec, template <class...> class PolicyTemplate, struct PolicyTraitAdaptorImpl<TraitSpec, PolicyTemplate, type_list<ProcessedTraits...>, type_list<>, NewTrait> { - static_assert(PolicyTraitMatcher<TraitSpec, NewTrait>::value, ""); + static_assert(PolicyTraitMatcher<TraitSpec, NewTrait>::value); using type = PolicyTemplate<ProcessedTraits..., NewTrait>; }; diff --git a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp index 86130025530516d4a8a1937b60310fb443943606..4e91d89f0f9c8b5a76d69df126ab85472d0683b6 100644 --- a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp @@ -78,7 +78,7 @@ namespace Experimental { template <class Policy, class ScheduleType> constexpr auto require(Policy const& p, Kokkos::Schedule<ScheduleType>) { - static_assert(Kokkos::is_execution_policy<Policy>::value, ""); + static_assert(Kokkos::is_execution_policy<Policy>::value); using new_policy_t = 
Kokkos::Impl::ScheduleTrait::policy_with_trait< Policy, Kokkos::Schedule<ScheduleType>>; return new_policy_t{p}; diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp index 8f95385c851748a1e578edc54e8875672128a9ce..ae7aa6e534fd1f3de403392074e843a8d3d958b9 100644 --- a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp @@ -57,7 +57,7 @@ namespace Experimental { template <class Policy, unsigned long Property> constexpr auto require(const Policy p, WorkItemProperty::ImplWorkItemProperty<Property>) { - static_assert(Kokkos::is_execution_policy<Policy>::value, ""); + static_assert(Kokkos::is_execution_policy<Policy>::value); using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait< Policy, WorkItemProperty::ImplWorkItemProperty<Property>>; return new_policy_t{p}; diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp index 6eda78f838dc9a9ac914557e8c12be573b9528fb..b25e15dc1524e4eebbdecb3d26bd87dfcd3a6dcf 100644 --- a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp @@ -60,7 +60,7 @@ struct WorkTagTrait : TraitSpecificationBase<WorkTagTrait> { show_extra_work_tag_erroneously_given_to_execution_policy< typename base_t::work_tag>{}; static_assert( - std::is_void<typename base_t::work_tag>::value, + std::is_void_v<typename base_t::work_tag>, "Kokkos Error: More than one work tag given. Search compiler output " "for 'show_extra_work_tag' to see the type of the errant tag."); }; @@ -81,7 +81,7 @@ struct WorkTagTrait : TraitSpecificationBase<WorkTagTrait> { // we should benchmark this assumption if it becomes a problem. 
template <class T> using trait_matches_specification = std::bool_constant< - std::is_empty<T>::value && + std::is_empty_v<T> && !type_list_any<_trait_matches_spec_predicate<T>::template apply, _exec_policy_traits_without_work_tag>::value>; }; diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt index b71c72c3c9f7f96545ff273f53d4066dfe507167..b9029b8d29721d1fdd068692567c0868cd9b92f0 100644 --- a/packages/kokkos/core/unit_test/CMakeLists.txt +++ b/packages/kokkos/core/unit_test/CMakeLists.txt @@ -2,113 +2,129 @@ # Add test-only library for gtest to be reused by all the subpackages # -IF(NOT GTest_FOUND) # fallback to internal gtest - SET(GTEST_SOURCE_DIR ${Kokkos_SOURCE_DIR}/tpls/gtest) - - #need here for tribits - KOKKOS_INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR}) - KOKKOS_ADD_TEST_LIBRARY( - kokkos_gtest - HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h - SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc +if(NOT GTest_FOUND) # fallback to internal gtest + set(GTEST_SOURCE_DIR ${Kokkos_SOURCE_DIR}/tpls/gtest) + + kokkos_add_test_library( + kokkos_gtest HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc ) - TARGET_INCLUDE_DIRECTORIES(kokkos_gtest PUBLIC ${GTEST_SOURCE_DIR}) - IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) - TARGET_COMPILE_FEATURES(kokkos_gtest PUBLIC cxx_std_14) - ENDIF() + target_include_directories(kokkos_gtest SYSTEM PUBLIC ${GTEST_SOURCE_DIR}) + if((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + target_compile_features(kokkos_gtest PUBLIC cxx_std_14) + endif() # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(kokkos_gtest PROPERTIES CXX_CLANG_TIDY "") - ENDIF() + if(CMAKE_CXX_CLANG_TIDY) + set_target_properties(kokkos_gtest PROPERTIES CXX_CLANG_TIDY "") + endif() - FIND_PACKAGE(Threads 
QUIET) - IF(TARGET Threads::Threads) - SET_TARGET_PROPERTIES(kokkos_gtest PROPERTIES - INTERFACE_LINK_LIBRARIES Threads::Threads) - ENDIF() -ENDIF() + find_package(Threads QUIET) + if(TARGET Threads::Threads) + set_target_properties(kokkos_gtest PROPERTIES INTERFACE_LINK_LIBRARIES Threads::Threads) + endif() +endif() # # Define Incremental Testing Feature Levels # Define Device name mappings (i.e. what comes after Kokkos:: for the ExecSpace) # -SET(KOKKOS_CUDA_FEATURE_LEVEL 999) -SET(KOKKOS_CUDA_NAME Cuda) -SET(KOKKOS_HIP_FEATURE_LEVEL 999) -SET(KOKKOS_HIP_NAME HIP) -SET(KOKKOS_HPX_FEATURE_LEVEL 999) -SET(KOKKOS_HPX_NAME Experimental::HPX) -SET(KOKKOS_OPENMP_FEATURE_LEVEL 999) -SET(KOKKOS_OPENMP_NAME OpenMP) +set(KOKKOS_CUDA_FEATURE_LEVEL 999) +set(KOKKOS_CUDA_NAME Cuda) +set(KOKKOS_HIP_FEATURE_LEVEL 999) +set(KOKKOS_HIP_NAME HIP) +set(KOKKOS_HPX_FEATURE_LEVEL 999) +set(KOKKOS_HPX_NAME Experimental::HPX) +set(KOKKOS_OPENMP_FEATURE_LEVEL 999) +set(KOKKOS_OPENMP_NAME OpenMP) # FIXME_OPENMPTARGET - The NVIDIA HPC compiler nvc++ only compiles the first 8 incremental tests for the OpenMPTarget backend. # FIXME_OPENMPTARGET - Clang version 17 fails to compile incremental tests past 12 with verion 17. There is PR for this in upstream already. So it should be fixed by version 18. -IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 10) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 17.0.0) - SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 12) -ELSE() - SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 14) -ENDIF() - -SET(KOKKOS_OPENMPTARGET_NAME Experimental::OpenMPTarget) -SET(KOKKOS_SERIAL_FEATURE_LEVEL 999) -SET(KOKKOS_SERIAL_NAME Serial) -SET(KOKKOS_SYCL_FEATURE_LEVEL 999) -SET(KOKKOS_SYCL_NAME Experimental::SYCL) -SET(KOKKOS_THREADS_FEATURE_LEVEL 999) -SET(KOKKOS_THREADS_NAME Threads) -# FIXME_OPENACC - The Clang compiler only compiles the first 9 incremental tests for the OpenACC backend. 
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - SET(KOKKOS_OPENACC_FEATURE_LEVEL 9) -ELSE() - SET(KOKKOS_OPENACC_FEATURE_LEVEL 16) -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + set(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 10) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 17.0.0) + set(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 12) +else() + set(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 14) +endif() -SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) +set(KOKKOS_OPENMPTARGET_NAME Experimental::OpenMPTarget) +set(KOKKOS_SERIAL_FEATURE_LEVEL 999) +set(KOKKOS_SERIAL_NAME Serial) +set(KOKKOS_SYCL_FEATURE_LEVEL 999) +set(KOKKOS_SYCL_NAME SYCL) +set(KOKKOS_THREADS_FEATURE_LEVEL 999) +set(KOKKOS_THREADS_NAME Threads) +# FIXME_OPENACC - The Clang compiler only compiles the first 9 incremental tests for the OpenACC backend. +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + set(KOKKOS_OPENACC_FEATURE_LEVEL 9) +else() + set(KOKKOS_OPENACC_FEATURE_LEVEL 17) +endif() +set(KOKKOS_OPENACC_NAME Experimental::OpenACC) # # Define the tests # #I will leave these alone for now because I don't need transitive dependencies on tests -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) - -SET(COMPILE_ONLY_SOURCES - TestArray.cpp - TestCreateMirror.cpp - TestDetectionIdiom.cpp - TestBitManipulation.cpp - TestInterOp.cpp - TestStringManipulation.cpp - TestVersionMacros.cpp - TestViewRank.cpp - TestViewTypeTraits.cpp - TestTypeList.cpp - view/TestExtentsDatatypeConversion.cpp +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) + +set(COMPILE_ONLY_SOURCES + TestArray.cpp + TestCreateMirror.cpp + 
TestDetectionIdiom.cpp + TestBitManipulation.cpp + TestInterOp.cpp + TestRangePolicyCTAD.cpp + TestStringManipulation.cpp + TestVersionMacros.cpp + TestViewRank.cpp + TestViewTypeTraits.cpp + TestViewTypedefs.cpp + TestTypeInfo.cpp + TestTypeList.cpp + TestMDRangePolicyCTAD.cpp + TestTeamPolicyCTAD.cpp + TestTeamMDRangePolicyCTAD.cpp + TestNestedReducerCTAD.cpp + view/TestBasicViewMDSpanConversion.cpp + view/TestExtentsDatatypeConversion.cpp ) +if(NOT Kokkos_ENABLE_IMPL_MDSPAN OR KOKKOS_CXX_COMPILER_ID STREQUAL "Intel") + list(REMOVE_ITEM COMPILE_ONLY_SOURCES view/TestBasicViewMDSpanConversion.cpp) +endif() + #testing if windows.h and Kokkos_Core.hpp can be included if(WIN32) - LIST(APPEND COMPILE_ONLY_SOURCES TestWindowsInclude.cpp) + list(APPEND COMPILE_ONLY_SOURCES TestWindowsInclude.cpp) endif() -#TestInterOp has a dependency on containers -IF(KOKKOS_HAS_TRILINOS) - LIST(REMOVE_ITEM COMPILE_ONLY_SOURCES TestInterOp.cpp) -ENDIF() -KOKKOS_ADD_EXECUTABLE( - CoreTestCompileOnly - SOURCES - TestCompileMain.cpp - ${COMPILE_ONLY_SOURCES} -) +if(Kokkos_INSTALL_TESTING) # FIXME Kokkos_ and KOKKOS_ variables are out of sync + if((Kokkos_CXX_COMPILER_ID STREQUAL "Intel" AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS 2021.2.0) + OR (Kokkos_CXX_COMPILER_ID STREQUAL "NVIDIA" AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS 11.3.0) + OR (Kokkos_CXX_COMPILER_ID STREQUAL "NVIDIA" AND Kokkos_CXX_HOST_COMPILER_ID STREQUAL "MSVC") + ) + list(REMOVE_ITEM COMPILE_ONLY_SOURCES TestTypeInfo.cpp) + endif() +else() + if((KOKKOS_CXX_COMPILER_ID STREQUAL "Intel" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.2.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.3.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") + ) + list(REMOVE_ITEM COMPILE_ONLY_SOURCES TestTypeInfo.cpp) + endif() +endif() + +if(Kokkos_ENABLE_OPENMPTARGET) + list(REMOVE_ITEM COMPILE_ONLY_SOURCES TestNestedReducerCTAD.cpp) 
+endif() +kokkos_add_executable(CoreTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES}) foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) string(TOUPPER ${Tag} DEVICE) @@ -121,57 +137,57 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) # command line in an intermediate compilation step even if CMake generated a response # file. That then exceeded the shell command line max length. set(${Tag}_SOURCES1A) - foreach(Name - Abort - ArrayOps - AtomicOperations_complexdouble - AtomicOperations_complexfloat - AtomicOperations_double - AtomicOperations_float - AtomicOperations_int - AtomicOperations_longint - AtomicOperations_longlongint - AtomicOperations_shared - AtomicOperations_unsignedint - AtomicOperations_unsignedlongint - Atomics - AtomicViews - BitManipulationBuiltins - BlockSizeDeduction - CheckedIntegerOps - CommonPolicyConstructors - CommonPolicyInterface - Complex - Concepts - Crs - DeepCopyAlignment - ExecSpacePartitioning - ExecutionSpace - FunctorAnalysis - HostSharedPtr - HostSharedPtrAccessOnDevice - Init - JoinBackwardCompatibility - LocalDeepCopy - MathematicalConstants - MathematicalFunctions1 - MathematicalFunctions2 - MathematicalFunctions3 - MathematicalSpecialFunctions - ) + foreach( + Name + Abort + ArrayOps + AtomicOperations_complexdouble + AtomicOperations_complexfloat + AtomicOperations_double + AtomicOperations_float + AtomicOperations_int + AtomicOperations_longint + AtomicOperations_longlongint + AtomicOperations_shared + AtomicOperations_unsignedint + AtomicOperations_unsignedlongint + Atomics + AtomicViews + BitManipulationBuiltins + BlockSizeDeduction + CheckedIntegerOps + CommonPolicyConstructors + CommonPolicyInterface + Complex + Concepts + Crs + DeepCopyAlignment + ExecSpacePartitioning + ExecSpaceThreadSafety + ExecutionSpace + FunctorAnalysis + Graph + HostSharedPtr + HostSharedPtrAccessOnDevice + Init + JoinBackwardCompatibility + LocalDeepCopy + 
MathematicalConstants + MathematicalFunctions1 + MathematicalFunctions2 + MathematicalFunctions3 + MathematicalSpecialFunctions + ) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. - file(WRITE ${dir}/dummy.cpp - "#include <Test${Tag}_Category.hpp>\n" - "#include <Test${Name}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${Tag}_Category.hpp>\n" "#include <Test${Name}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ${Tag}_SOURCES1A ${file}) endforeach() set(${Tag}_SOURCES1B) - foreach(Name + set(${Tag}_TESTNAMES1B MDRange_a MDRange_b MDRange_c @@ -182,8 +198,11 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDRangePolicyConstructors MDRangeReduce MDSpan + MDSpanAtomicAccessor + MDSpanConversion MinMaxClamp NumericTraits + OccupancyControlTrait Other ParallelScanRangePolicy Printf @@ -200,63 +219,83 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Reductions Reductions_DeviceView SharedAlloc - ) + SpaceAwareAccessorAccessViolation + SpaceAwareAccessor + Swap + ) + if(NOT Kokkos_ENABLE_IMPL_MDSPAN) + list(REMOVE_ITEM ${Tag}_TESTNAMES1B MDSpanAtomicAccessor MDSpanConversion SpaceAwareAccessorAccessViolation + SpaceAwareAccessor + ) + endif() + # This test case causes MSVC to fail with "number of sections exceeded object file format limit" + if(MSVC) + list(REMOVE_ITEM ${Tag}_TESTNAMES1B Reducers_d) + endif() + foreach(Name IN LISTS ${Tag}_TESTNAMES1B) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
- file(WRITE ${dir}/dummy.cpp - "#include <Test${Tag}_Category.hpp>\n" - "#include <Test${Name}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${Tag}_Category.hpp>\n" "#include <Test${Name}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ${Tag}_SOURCES1B ${file}) endforeach() - SET(${Tag}_SOURCES2A) - foreach(Name - TeamBasic - TeamCombinedReducers - TeamMDRange - TeamPolicyConstructors - TeamReductionScan - TeamScan - TeamScratch - TeamTeamSize - TeamVectorRange - UniqueToken - View_64bit - ViewAPI_a - ViewAPI_b - ViewAPI_c - ViewAPI_d - ViewAPI_e - ViewCopy_a - ViewCopy_b - ViewCtorDimMatch - ViewHooks - ViewLayoutStrideAssignment - ViewMapping_a - ViewMapping_b - ViewMapping_subview - ViewMemoryAccessViolation - ViewOfClass - ViewResize - WorkGraph - WithoutInitializing - ) + set(${Tag}_SOURCES2A) + set(${Tag}_TESTNAMES2A + TeamBasic + TeamCombinedReducers + TeamMDRange + TeamPolicyConstructors + TeamReductionScan + TeamScan + TeamScratch + TeamTeamSize + TeamVectorRange + UniqueToken + View_64bit + ViewAPI_a + ViewAPI_b + ViewAPI_c + ViewAPI_d + ViewAPI_e + ViewBadAlloc + ViewCopy_a + ViewCopy_b + ViewCopy_c + ViewCtorDimMatch + ViewCtorProp + ViewEmptyRuntimeUnmanaged + ViewHooks + ViewLayoutStrideAssignment + ViewMapping_a + ViewMapping_b + ViewMapping_subview + ViewMemoryAccessViolation + ViewOfClass + ViewOfViews + ViewOutOfBoundsAccess + ViewResize + WorkGraph + WithoutInitializing + ) + # Workaround to internal compiler error with intel classic compilers + # when using -no-ip flag in ViewCopy_c + # See issue: https://github.com/kokkos/kokkos/issues/7084 + if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + list(REMOVE_ITEM ${Tag}_TESTNAMES2A ViewCopy_c) + endif() + foreach(Name IN LISTS ${Tag}_TESTNAMES2A) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
- file(WRITE ${dir}/dummy.cpp - "#include <Test${Tag}_Category.hpp>\n" - "#include <Test${Name}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${Tag}_Category.hpp>\n" "#include <Test${Name}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ${Tag}_SOURCES2A ${file}) endforeach() set(TagHostAccessible ${Tag}) - if (Tag STREQUAL "Cuda") + if(Tag STREQUAL "Cuda") set(TagHostAccessible CudaUVM) elseif(Tag STREQUAL "HIP") set(TagHostAccessible HIPManaged) @@ -265,7 +304,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) endif() set(${Tag}_SOURCES2B) - foreach(Name + foreach( + Name SubView_a SubView_b SubView_c01 @@ -273,58 +313,56 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) SubView_c03 SubView_c04 SubView_c05 - ) + ) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. - file(WRITE ${dir}/dummy.cpp - "#include <Test${TagHostAccessible}_Category.hpp>\n" - "#include <Test${Name}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${TagHostAccessible}_Category.hpp>\n" "#include <Test${Name}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ${Tag}_SOURCES2B ${file}) endforeach() set(${Tag}_SOURCES2C) - foreach(Name - SubView_c06 - SubView_c07 - SubView_c08 - SubView_c09 - ) + foreach(Name SubView_c06 SubView_c07 SubView_c08 SubView_c09) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
- file(WRITE ${dir}/dummy.cpp - "#include <Test${TagHostAccessible}_Category.hpp>\n" - "#include <Test${Name}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${TagHostAccessible}_Category.hpp>\n" "#include <Test${Name}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ${Tag}_SOURCES2C ${file}) endforeach() set(${Tag}_SOURCES2D) - foreach(Name - SubView_c10 - SubView_c11 - SubView_c12 - SubView_c13 - SubView_c14 - ) + foreach(Name SubView_c10 SubView_c11 SubView_c12 SubView_c13 SubView_c14) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. - file(WRITE ${dir}/dummy.cpp - "#include <Test${TagHostAccessible}_Category.hpp>\n" - "#include <Test${Name}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${TagHostAccessible}_Category.hpp>\n" "#include <Test${Name}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ${Tag}_SOURCES2D ${file}) endforeach() - SET(${Tag}_SOURCES1 ${${Tag}_SOURCES1A} ${${Tag}_SOURCES1B}) - SET(${Tag}_SOURCES2 ${${Tag}_SOURCES2A} ${${Tag}_SOURCES2B} ${${Tag}_SOURCES2C} ${${Tag}_SOURCES2D}) - SET(${Tag}_SOURCES ${${Tag}_SOURCES1} ${${Tag}_SOURCES2}) + set(${Tag}_SOURCES1 ${${Tag}_SOURCES1A} ${${Tag}_SOURCES1B}) + set(${Tag}_SOURCES2 ${${Tag}_SOURCES2A} ${${Tag}_SOURCES2B} ${${Tag}_SOURCES2C} ${${Tag}_SOURCES2D}) + set(${Tag}_SOURCES ${${Tag}_SOURCES1} ${${Tag}_SOURCES2}) + + # ViewSupport should eventually contain the new implementation + # detail tests for the mdspan based View + set(${Tag}_VIEWSUPPORT) + if(Kokkos_ENABLE_IMPL_MDSPAN) + foreach(Name BasicView ReferenceCountedAccessor ReferenceCountedDataHandle) + set(file ${dir}/Test${Tag}_View_${Name}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
+ file(WRITE ${dir}/dummy.cpp "#include <Test${Tag}_Category.hpp>\n" "#include <view/Test${Name}.hpp>\n") + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ${Tag}_VIEWSUPPORT ${file}) + endforeach() + if(KOKKOS_CXX_COMPILER_ID STREQUAL "Intel") + list(REMOVE_ITEM ${Tag}_VIEWSUPPORT ${dir}/Test${Tag}_View_BasicView.cpp) + endif() + kokkos_add_executable_and_test(CoreUnitTest_${Tag}_ViewSupport SOURCES UnitTestMainInit.cpp ${${Tag}_VIEWSUPPORT}) + endif() endif() endforeach() @@ -338,7 +376,8 @@ foreach(PairDeviceSpace HIP-HostPinned;HIP-Managed;Cuda-HostPinned;Cuda-UVM;SYCL if(Kokkos_ENABLE_${UPPER_DEVICE}) set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) file(MAKE_DIRECTORY ${dir}) - foreach(Name + foreach( + Name SharedAlloc ViewAPI_a ViewAPI_b @@ -347,17 +386,15 @@ foreach(PairDeviceSpace HIP-HostPinned;HIP-Managed;Cuda-HostPinned;Cuda-UVM;SYCL ViewAPI_e ViewCopy_a ViewCopy_b + ViewCopy_c ViewMapping_a ViewMapping_b ViewMapping_subview - ) + ) set(file ${dir}/Test${DEVICE}${SPACE}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. - file(WRITE ${dir}/dummy.cpp - "#include <Test${DEVICE}${SPACE}_Category.hpp>\n" - "#include <Test${Name}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include <Test${DEVICE}${SPACE}_Category.hpp>\n" "#include <Test${Name}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ${DEVICE}_SOURCES3 ${file}) endforeach() @@ -367,53 +404,60 @@ endforeach() # Disable non-compiling tests based on clang version. 
if(Kokkos_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM OpenMPTarget_SOURCES + list( + REMOVE_ITEM + OpenMPTarget_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp - IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp - endif() - IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp - endif() + IF + (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp + IF + (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.3) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + endif + () + endif + () # FIXME_OPENMPTARGET_CRAY: The following tests fail at compile time when the OpenMPTarget backend is enabled with the Cray compiler. # Atomic compare/exchange is used in these tests which can be one of the reasons for the compilation failures. - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexdouble.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SharedAlloc.cpp - ENDIF() - ) + IF + (KOKKOS_CXX_COMPILER_ID STREQUAL Cray) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexdouble.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SharedAlloc.cpp + ENDIF + () + ) endif() # FIXME_OPENMPTARGET - MinMaxClamp fails even with the host backend when OpenMPTarget backend is enabled. # FIXME_OPENMPTARGET - Unsure of the reason as of now. 
-IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - IF(Kokkos_ENABLE_OPENMPTARGET AND Kokkos_ENABLE_OPENMP) - list(REMOVE_ITEM OpenMP_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openmp/TestOpenMP_MinMaxClamp.cpp - ) - ENDIF() - IF(Kokkos_ENABLE_OPENMPTARGET AND Kokkos_ENABLE_SERIAL) - list(REMOVE_ITEM Serial_SOURCES1 - ${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_MinMaxClamp.cpp - ) - ENDIF() -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) + if(Kokkos_ENABLE_OPENMPTARGET AND Kokkos_ENABLE_OPENMP) + list(REMOVE_ITEM OpenMP_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openmp/TestOpenMP_MinMaxClamp.cpp) + endif() + if(Kokkos_ENABLE_OPENMPTARGET AND Kokkos_ENABLE_SERIAL) + list(REMOVE_ITEM Serial_SOURCES1 ${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_MinMaxClamp.cpp) + endif() +endif() if(Kokkos_ENABLE_OPENACC) - list(REMOVE_ITEM OpenACC_SOURCES + list( + REMOVE_ITEM + OpenACC_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp @@ -434,25 +478,23 @@ endif() # FIXME_OPENMPTARGET - Comment non-passing tests with amdclang++ # FIXME_OPENMPTARGET - Need to check on GFX1030 and GFX1100 architectures -IF(KOKKOS_ARCH_VEGA) - SET(KOKKOS_AMDGPU_ARCH TRUE) -ENDIF() -IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_AMDGPU_ARCH) - LIST(REMOVE_ITEM OpenMPTarget_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_e.cpp - ) -ENDIF() +if(KOKKOS_ARCH_VEGA) + set(KOKKOS_AMDGPU_ARCH TRUE) +endif() +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_AMDGPU_ARCH) + list(REMOVE_ITEM OpenMPTarget_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_e.cpp) +endif() # FIXME_OPENMPTARGET This test 
causes internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. -IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) - LIST(REMOVE_ITEM OpenMPTarget_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp - ) -ENDIF() +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) + list(REMOVE_ITEM OpenMPTarget_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp) +endif() # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++ -IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - list(REMOVE_ITEM OpenMPTarget_SOURCES +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list( + REMOVE_ITEM + OpenMPTarget_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations.cpp @@ -514,25 +556,17 @@ IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_f.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewResize.cpp - ) + ) endif() # FIXME_OPENACC - Comment non-passing tests with the NVIDIA HPC compiler nvc++ -IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - list(REMOVE_ITEM OpenACC_SOURCES +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list( + REMOVE_ITEM + OpenACC_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp - 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp @@ -549,24 +583,19 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp - ) + ) endif() # FIXME_OPENACC - 
Comment non-passing tests with the Clang compiler -IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - list(REMOVE_ITEM OpenACC_SOURCES +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + list( + REMOVE_ITEM + OpenACC_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp @@ -590,6 +619,7 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_c.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp @@ -616,340 +646,238 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMemoryAccessViolation.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_WithoutInitializing.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_d.cpp - ) + #Below test is disabled because it uses atomic operations not supported by Clacc. + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ExecSpaceThreadSafety.cpp + ) # When tested on a systme with AMD MI60 GPU and ROCm V5.4.0, these cause # clang-linker-wrapper to hang for a long time while building the unit tests. # In some cases, including them caused the build not to complete after an hour, # but excluding them permitted the build to finish in 1.5 mins or less. 
- IF(KOKKOS_AMDGPU_ARCH) - list(REMOVE_ITEM OpenACC_SOURCES - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BitManipulationBuiltins.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ParallelScanRangePolicy.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c04.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c06.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c07.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c09.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c10.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c12.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_c.cpp + if(KOKKOS_AMDGPU_ARCH) + list( + REMOVE_ITEM + OpenACC_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BitManipulationBuiltins.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ParallelScanRangePolicy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c04.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c06.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c07.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c09.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c10.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c12.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_b.cpp + 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_c.cpp ) endif() # Fails serial.atomics_tpetra_max_abs when we test with Clacc. - list(REMOVE_ITEM Serial_SOURCES1 - ${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_Atomics.cpp) + list(REMOVE_ITEM Serial_SOURCES1 ${CMAKE_CURRENT_BINARY_DIR}/serial/TestSerial_Atomics.cpp) endif() if(Kokkos_ENABLE_SERIAL) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_Serial1 - SOURCES - UnitTestMainInit.cpp - ${Serial_SOURCES1} - serial/TestSerial_Task.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_Serial2 - SOURCES - UnitTestMainInit.cpp - ${Serial_SOURCES2} - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SerialGraph - SOURCES - UnitTestMainInit.cpp - serial/TestSerial_Graph.cpp + if(Kokkos_ENABLE_DEPRECATED_CODE_4) + set(Serial_EXTRA_SOURCES serial/TestSerial_Task.cpp) + else() + set(Serial_EXTRA_SOURCES) + endif() + + kokkos_add_executable_and_test( + CoreUnitTest_Serial1 SOURCES UnitTestMainInit.cpp ${Serial_SOURCES1} ${Serial_EXTRA_SOURCES} ) + kokkos_add_executable_and_test(CoreUnitTest_Serial2 SOURCES UnitTestMainInit.cpp ${Serial_SOURCES2}) endif() if(Kokkos_ENABLE_THREADS) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_Threads - SOURCES ${Threads_SOURCES} - UnitTestMainInit.cpp - ) + kokkos_add_executable_and_test(CoreUnitTest_Threads SOURCES ${Threads_SOURCES} UnitTestMainInit.cpp) endif() -if (Kokkos_ENABLE_OPENMP) - set(OpenMP_EXTRA_SOURCES - openmp/TestOpenMP_Task.cpp - openmp/TestOpenMP_PartitionMaster.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_OpenMP - SOURCES - UnitTestMainInit.cpp - ${OpenMP_SOURCES} - ${OpenMP_EXTRA_SOURCES} - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_OpenMPInterOp - SOURCES - UnitTestMain.cpp - openmp/TestOpenMP_InterOp.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_OpenMPGraph - SOURCES - UnitTestMainInit.cpp - openmp/TestOpenMP_Graph.cpp +if(Kokkos_ENABLE_OPENMP) + if(Kokkos_ENABLE_DEPRECATED_CODE_4) + set(OpenMP_EXTRA_SOURCES 
openmp/TestOpenMP_Task.cpp) + else() + set(OpenMP_EXTRA_SOURCES) + endif() + kokkos_add_executable_and_test( + CoreUnitTest_OpenMP SOURCES UnitTestMainInit.cpp ${OpenMP_SOURCES} ${OpenMP_EXTRA_SOURCES} ) + kokkos_add_executable_and_test(CoreUnitTest_OpenMPInterOp SOURCES UnitTestMain.cpp openmp/TestOpenMP_InterOp.cpp) endif() if(Kokkos_ENABLE_HPX) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_HPX - SOURCES - UnitTestMainInit.cpp - ${HPX_SOURCES} - hpx/TestHPX_Task.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_HPXInterOp - SOURCES - UnitTestMain.cpp - hpx/TestHPX_InterOp.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( + if(Kokkos_ENABLE_DEPRECATED_CODE_4) + set(HPX_EXTRA_SOURCES hpx/TestHPX_Task.cpp) + else() + set(HPX_EXTRA_SOURCES) + endif() + + kokkos_add_executable_and_test(CoreUnitTest_HPX SOURCES UnitTestMainInit.cpp ${HPX_SOURCES} ${HPX_EXTRA_SOURCES}) + kokkos_add_executable_and_test(CoreUnitTest_HPXInterOp SOURCES UnitTestMain.cpp hpx/TestHPX_InterOp.cpp) + kokkos_add_executable_and_test( CoreUnitTest_HPX_IndependentInstances SOURCES - UnitTestMainInit.cpp - hpx/TestHPX_IndependentInstances.cpp - hpx/TestHPX_IndependentInstancesDelayedExecution.cpp - hpx/TestHPX_IndependentInstancesInstanceIds.cpp - hpx/TestHPX_IndependentInstancesRefCounting.cpp - hpx/TestHPX_IndependentInstancesSynchronization.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_HPX_InParallel - SOURCES - UnitTestMainInit.cpp - hpx/TestHPX_InParallel.cpp + UnitTestMainInit.cpp + hpx/TestHPX_IndependentInstances.cpp + hpx/TestHPX_IndependentInstancesDelayedExecution.cpp + hpx/TestHPX_IndependentInstancesInstanceIds.cpp + hpx/TestHPX_IndependentInstancesRefCounting.cpp + hpx/TestHPX_IndependentInstancesSynchronization.cpp ) + if(Kokkos_ENABLE_DEPRECATED_CODE_4) + kokkos_add_executable_and_test(CoreUnitTest_HPX_InParallel SOURCES UnitTestMainInit.cpp hpx/TestHPX_InParallel.cpp) + endif() endif() if(Kokkos_ENABLE_OPENMPTARGET) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - 
CoreUnitTest_OpenMPTarget - SOURCES - UnitTestMainInit.cpp - ${OpenMPTarget_SOURCES} - ) + kokkos_add_executable_and_test(CoreUnitTest_OpenMPTarget SOURCES UnitTestMainInit.cpp ${OpenMPTarget_SOURCES}) endif() if(Kokkos_ENABLE_OPENACC) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_OpenACC - SOURCES - UnitTestMainInit.cpp - ${OpenACC_SOURCES} - ) + kokkos_add_executable_and_test(CoreUnitTest_OpenACC SOURCES UnitTestMainInit.cpp ${OpenACC_SOURCES}) endif() if(Kokkos_ENABLE_CUDA) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_Cuda1 - SOURCES - UnitTestMainInit.cpp - ${Cuda_SOURCES1} - cuda/TestCuda_ReducerViewSizeLimit.cpp - ) + kokkos_add_executable_and_test( + CoreUnitTest_Cuda1 SOURCES UnitTestMainInit.cpp ${Cuda_SOURCES1} cuda/TestCuda_ReducerViewSizeLimit.cpp + ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_Cuda2 - SOURCES - UnitTestMainInit.cpp - ${Cuda_SOURCES2} - ) + kokkos_add_executable_and_test(CoreUnitTest_Cuda2 SOURCES UnitTestMainInit.cpp ${Cuda_SOURCES2}) + + if(Kokkos_ENABLE_DEPRECATED_CODE_4) + set(Cuda_EXTRA_SOURCES cuda/TestCuda_Task.cpp) + else() + set(Cuda_EXTRA_SOURCES) + endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST( + kokkos_add_executable_and_test( CoreUnitTest_Cuda3 SOURCES - UnitTestMainInit.cpp - cuda/TestCuda_Task.cpp - cuda/TestCuda_TeamScratchStreams.cpp - ${Cuda_SOURCES3} - cuda/TestCuda_Spaces.cpp - ${Cuda_SOURCES_SHAREDSPACE} - ) + UnitTestMainInit.cpp + ${Cuda_EXTRA_SOURCES} + cuda/TestCuda_TeamScratchStreams.cpp + ${Cuda_SOURCES3} + cuda/TestCuda_Spaces.cpp + ${Cuda_SOURCES_SHAREDSPACE} + ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaTimingBased - SOURCES - UnitTestMainInit.cpp - cuda/TestCuda_DebugSerialExecution.cpp - cuda/TestCuda_DebugPinUVMSpace.cpp + kokkos_add_executable_and_test( + CoreUnitTest_CudaTimingBased SOURCES UnitTestMainInit.cpp cuda/TestCuda_DebugSerialExecution.cpp + cuda/TestCuda_DebugPinUVMSpace.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaInterOpInit - SOURCES - UnitTestMain.cpp - 
cuda/TestCuda_InterOp_Init.cpp + kokkos_add_executable_and_test(CoreUnitTest_CudaInterOpInit SOURCES UnitTestMain.cpp cuda/TestCuda_InterOp_Init.cpp) + kokkos_add_executable_and_test( + CoreUnitTest_CudaInterOpStreams SOURCES UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaInterOpStreams - SOURCES - UnitTestMain.cpp - cuda/TestCuda_InterOp_Streams.cpp + kokkos_add_executable_and_test( + CoreUnitTest_CudaInterOpStreamsMultiGPU SOURCES UnitTestMainInit.cpp cuda/TestCuda_InterOp_StreamsMultiGPU.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CudaGraph - SOURCES - UnitTestMainInit.cpp - cuda/TestCuda_Graph.cpp + + kokkos_add_executable_and_test( + CoreUnitTest_CudaInterOpGraph SOURCES UnitTestMainInit.cpp cuda/TestCuda_InterOp_Graph.cpp ) endif() if(Kokkos_ENABLE_HIP) - KOKKOS_ADD_EXECUTABLE_AND_TEST( + kokkos_add_executable_and_test( CoreUnitTest_HIP SOURCES - UnitTestMainInit.cpp - ${HIP_SOURCES} - hip/TestHIP_ScanUnit.cpp - hip/TestHIP_Spaces.cpp - hip/TestHIP_Memory_Requirements.cpp - hip/TestHIP_TeamScratchStreams.cpp - hip/TestHIP_AsyncLauncher.cpp - hip/TestHIP_BlocksizeDeduction.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_HIPInterOpInit - SOURCES - UnitTestMain.cpp - hip/TestHIP_InterOp_Init.cpp + UnitTestMainInit.cpp + ${HIP_SOURCES} + hip/TestHIP_ScanUnit.cpp + hip/TestHIP_Spaces.cpp + hip/TestHIP_Memory_Requirements.cpp + hip/TestHIP_TeamScratchStreams.cpp + hip/TestHIP_AsyncLauncher.cpp + hip/TestHIP_BlocksizeDeduction.cpp + hip/TestHIP_UnifiedMemory_ZeroMemset.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_HIPInterOpStreams - SOURCES - UnitTestMain.cpp - hip/TestHIP_InterOp_Streams.cpp + kokkos_add_executable_and_test(CoreUnitTest_HIPInterOpInit SOURCES UnitTestMain.cpp hip/TestHIP_InterOp_Init.cpp) + kokkos_add_executable_and_test( + CoreUnitTest_HIPInterOpStreams SOURCES UnitTestMain.cpp hip/TestHIP_InterOp_Streams.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - 
UnitTest_HIPGraph - SOURCES - UnitTestMainInit.cpp - hip/TestHIP_Graph.cpp + + kokkos_add_executable_and_test( + CoreUnitTest_HIPInterOpGraph SOURCES UnitTestMainInit.cpp hip/TestHIP_InterOp_Graph.cpp ) endif() if(Kokkos_ENABLE_SYCL) - list(REMOVE_ITEM SYCL_SOURCES2A - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_WorkGraph.cpp - ) + list(REMOVE_ITEM SYCL_SOURCES2A ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_WorkGraph.cpp) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCL1A - SOURCES - UnitTestMainInit.cpp - ${SYCL_SOURCES1A} - ) + kokkos_add_executable_and_test(CoreUnitTest_SYCL1A SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES1A}) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCL1B - SOURCES - UnitTestMainInit.cpp - ${SYCL_SOURCES1B} - ) + kokkos_add_executable_and_test(CoreUnitTest_SYCL1B SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES1B}) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCL2A - SOURCES - UnitTestMainInit.cpp - ${SYCL_SOURCES2A} - ) + kokkos_add_executable_and_test(CoreUnitTest_SYCL2A SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES2A}) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCL2B - SOURCES - UnitTestMainInit.cpp - ${SYCL_SOURCES2B} - ) + kokkos_add_executable_and_test(CoreUnitTest_SYCL2B SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES2B}) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCL2C - SOURCES - UnitTestMainInit.cpp - ${SYCL_SOURCES2C} - ) + kokkos_add_executable_and_test(CoreUnitTest_SYCL2C SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES2C}) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCL2D - SOURCES - UnitTestMainInit.cpp - ${SYCL_SOURCES2D} - ) + kokkos_add_executable_and_test(CoreUnitTest_SYCL2D SOURCES UnitTestMainInit.cpp ${SYCL_SOURCES2D}) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCL3 - SOURCES - UnitTestMainInit.cpp - # FIXME_SYCL - sycl/TestSYCL_Task.cpp - sycl/TestSYCL_TeamScratchStreams.cpp - ${SYCL_SOURCES3} - sycl/TestSYCL_Spaces.cpp + kokkos_add_executable_and_test( + CoreUnitTest_SYCL3 SOURCES 
UnitTestMainInit.cpp sycl/TestSYCL_TeamScratchStreams.cpp ${SYCL_SOURCES3} + sycl/TestSYCL_Spaces.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCLInterOpInit - SOURCES - UnitTestMain.cpp - sycl/TestSYCL_InterOp_Init.cpp + kokkos_add_executable_and_test(CoreUnitTest_SYCLInterOpInit SOURCES UnitTestMain.cpp sycl/TestSYCL_InterOp_Init.cpp) + kokkos_add_executable_and_test( + CoreUnitTest_SYCLInterOpInit_Context SOURCES UnitTestMainInit.cpp sycl/TestSYCL_InterOp_Init_Context.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCLInterOpInit_Context - SOURCES - UnitTestMainInit.cpp - sycl/TestSYCL_InterOp_Init_Context.cpp + kokkos_add_executable_and_test( + CoreUnitTest_SYCLInterOpStreams SOURCES UnitTestMain.cpp sycl/TestSYCL_InterOp_Streams.cpp ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_SYCLInterOpStreams - SOURCES - UnitTestMain.cpp - sycl/TestSYCL_InterOp_Streams.cpp + kokkos_add_executable_and_test( + CoreUnitTest_SYCLInterOpStreamsMultiGPU SOURCES UnitTestMainInit.cpp sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp ) + + if(KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_GRAPH) + kokkos_add_executable_and_test( + CoreUnitTest_SYCLInterOpGraph SOURCES UnitTestMainInit.cpp sycl/TestSYCL_InterOp_Graph.cpp + ) + endif() endif() -SET(DEFAULT_DEVICE_SOURCES - UnitTestMainInit.cpp - TestInitializationSettings.cpp - TestParseCmdLineArgsAndEnvVars.cpp - TestSharedSpace.cpp - TestSharedHostPinnedSpace.cpp - TestCompilerMacros.cpp - default/TestDefaultDeviceType.cpp - default/TestDefaultDeviceType_a1.cpp - default/TestDefaultDeviceType_b1.cpp - default/TestDefaultDeviceType_c1.cpp - default/TestDefaultDeviceType_a2.cpp - default/TestDefaultDeviceType_b2.cpp - default/TestDefaultDeviceType_c2.cpp - default/TestDefaultDeviceType_a3.cpp - default/TestDefaultDeviceType_b3.cpp - default/TestDefaultDeviceType_c3.cpp - default/TestDefaultDeviceType_d.cpp - default/TestDefaultDeviceTypeResize.cpp - default/TestDefaultDeviceTypeViewAPI.cpp +set(DEFAULT_DEVICE_SOURCES + 
UnitTestMainInit.cpp + TestCStyleMemoryManagement.cpp + TestInitializationSettings.cpp + TestParseCmdLineArgsAndEnvVars.cpp + TestSharedSpace.cpp + TestSharedHostPinnedSpace.cpp + TestCompilerMacros.cpp + default/TestDefaultDeviceType.cpp + default/TestDefaultDeviceType_a1.cpp + default/TestDefaultDeviceType_b1.cpp + default/TestDefaultDeviceType_c1.cpp + default/TestDefaultDeviceType_a2.cpp + default/TestDefaultDeviceType_b2.cpp + default/TestDefaultDeviceType_c2.cpp + default/TestDefaultDeviceType_a3.cpp + default/TestDefaultDeviceType_b3.cpp + default/TestDefaultDeviceType_c3.cpp + default/TestDefaultDeviceTypeResize.cpp + default/TestDefaultDeviceTypeViewAPI.cpp ) # FIXME_OPENMPTARGET and FIXME_OPENACC do not provide a MemorySpace that can be accessed from all ExecSpaces # FIXME_SYCL clock_tic does not give the correct timings for cloc_tic -if (KOKKOS_ENABLE_OPENACC OR KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_SYCL) - LIST(REMOVE_ITEM DEFAULT_DEVICE_SOURCES TestSharedSpace.cpp) +if(KOKKOS_ENABLE_OPENACC OR KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_SYCL) + list(REMOVE_ITEM DEFAULT_DEVICE_SOURCES TestSharedSpace.cpp) endif() # FIXME_OPENMPTARGET and FIXME_OPENACC do not provide a HostPinnedMemorySpace that can be accessed from all ExecSpaces -if (KOKKOS_ENABLE_OPENACC OR KOKKOS_ENABLE_OPENMPTARGET) - LIST(REMOVE_ITEM DEFAULT_DEVICE_SOURCES TestSharedHostPinnedSpace.cpp) +if(KOKKOS_ENABLE_OPENACC OR KOKKOS_ENABLE_OPENMPTARGET) + list(REMOVE_ITEM DEFAULT_DEVICE_SOURCES TestSharedHostPinnedSpace.cpp) endif() # FIXME_OPENMPTARGET, FIXME_OPENACC - Comment non-passing tests with the NVIDIA HPC compiler nvc++ -if ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM DEFAULT_DEVICE_SOURCES +if((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list( + REMOVE_ITEM + DEFAULT_DEVICE_SOURCES default/TestDefaultDeviceType_a1.cpp 
default/TestDefaultDeviceType_b1.cpp default/TestDefaultDeviceType_c1.cpp @@ -959,315 +887,273 @@ if ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILE default/TestDefaultDeviceType_a3.cpp default/TestDefaultDeviceType_b3.cpp default/TestDefaultDeviceType_c3.cpp - default/TestDefaultDeviceType_d.cpp default/TestDefaultDeviceTypeResize.cpp default/TestDefaultDeviceTypeViewAPI.cpp ) endif() # FIXME_OPENACC - Comment non-passing tests with the Clang compiler -if (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - SET(DEFAULT_DEVICE_SOURCES - TestCompilerMacros.cpp - UnitTestMainInit.cpp - TestInitializationSettings.cpp - TestParseCmdLineArgsAndEnvVars.cpp - default/TestDefaultDeviceType_d.cpp +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + list( + REMOVE_ITEM + DEFAULT_DEVICE_SOURCES + default/TestDefaultDeviceType_a1.cpp + default/TestDefaultDeviceType_b1.cpp + default/TestDefaultDeviceType_c1.cpp + default/TestDefaultDeviceType_a2.cpp + default/TestDefaultDeviceType_b2.cpp + default/TestDefaultDeviceType_c2.cpp + default/TestDefaultDeviceType_a3.cpp + default/TestDefaultDeviceType_b3.cpp + default/TestDefaultDeviceType_c3.cpp default/TestDefaultDeviceTypeResize.cpp default/TestDefaultDeviceTypeViewAPI.cpp ) endif() -KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_Default - SOURCES ${DEFAULT_DEVICE_SOURCES} -) +kokkos_add_executable_and_test(CoreUnitTest_Default SOURCES ${DEFAULT_DEVICE_SOURCES}) -KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_LegionInitialization - SOURCES - UnitTestMain.cpp - TestLegionInitialization.cpp -) +kokkos_add_executable_and_test(CoreUnitTest_LegionInitialization SOURCES UnitTestMain.cpp TestLegionInitialization.cpp) -KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_PushFinalizeHook - SOURCES - UnitTest_PushFinalizeHook.cpp -) +kokkos_add_executable_and_test(CoreUnitTest_PushFinalizeHook SOURCES UnitTest_PushFinalizeHook.cpp) + 
+kokkos_add_executable_and_test(CoreUnitTest_ScopeGuard SOURCES UnitTestMain.cpp UnitTest_ScopeGuard.cpp) # This test is intended for development and debugging by putting code # into TestDefaultDeviceDevelop.cpp. By default its empty. -KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_Develop - SOURCES - UnitTestMainInit.cpp - default/TestDefaultDeviceDevelop.cpp -) +kokkos_add_executable_and_test(CoreUnitTest_Develop SOURCES UnitTestMainInit.cpp default/TestDefaultDeviceDevelop.cpp) + +# With MSVC, the terminate handler is called and prints the message but the +# program does not seem to exit and we get a timeout with ctest. +if(NOT WIN32) + # This test is special, because it passes exactly when it prints the + # message "PASSED: I am the custom std::terminate handler.", AND calls + # std::terminate. This means that we can't use + # KOKKOS_ADD_EXECUTABLE_AND_TEST. See GitHub issue #2147. + kokkos_add_test_executable(CoreUnitTest_PushFinalizeHookTerminate SOURCES UnitTest_PushFinalizeHook_terminate.cpp) + add_test(NAME Kokkos_CoreUnitTest_PushFinalizeHookTerminateRegex + COMMAND ${CMAKE_COMMAND} -E env $<TARGET_FILE:Kokkos_CoreUnitTest_PushFinalizeHookTerminate> + ) + set_property( + TEST Kokkos_CoreUnitTest_PushFinalizeHookTerminateRegex PROPERTY PASS_REGULAR_EXPRESSION + "PASSED: I am the custom std::terminate handler." + ) + add_test(NAME Kokkos_CoreUnitTest_PushFinalizeHookTerminateFails + COMMAND ${CMAKE_COMMAND} -E env $<TARGET_FILE:Kokkos_CoreUnitTest_PushFinalizeHookTerminate> + ) + set_property(TEST Kokkos_CoreUnitTest_PushFinalizeHookTerminateFails PROPERTY WILL_FAIL TRUE) +endif() -# This test is special, because it passes exactly when it prints the -# message "PASSED: I am the custom std::terminate handler.", AND calls -# std::terminate. This means that we can't use -# KOKKOS_ADD_EXECUTABLE_AND_TEST. See GitHub issue #2147. 
+if(KOKKOS_ENABLE_TUNING) + kokkos_add_executable_and_test(CoreUnitTest_TuningBuiltins SOURCES tools/TestBuiltinTuners.cpp) + kokkos_add_executable_and_test(CoreUnitTest_TuningBasics SOURCES tools/TestTuning.cpp) + kokkos_add_executable_and_test(CoreUnitTest_CategoricalTuner SOURCES tools/TestCategoricalTuner.cpp) +endif() -KOKKOS_ADD_TEST_EXECUTABLE( push_finalize_hook_terminate - SOURCES UnitTest_PushFinalizeHook_terminate.cpp +set(KOKKOSP_SOURCES UnitTestMainInit.cpp tools/TestEventCorrectness.cpp tools/TestKernelNames.cpp + tools/TestProfilingSection.cpp tools/TestScopedRegion.cpp tools/TestWithoutInitializing.cpp ) -KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate - TEST_0 - EXEC push_finalize_hook_terminate - NUM_MPI_PROCS 1 - PASS_REGULAR_EXPRESSION - "PASSED: I am the custom std::terminate handler." - ALWAYS_FAIL_ON_ZERO_RETURN -) - if(KOKKOS_ENABLE_TUNING) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_TuningBuiltins - SOURCES - tools/TestBuiltinTuners.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_TuningBasics - SOURCES - tools/TestTuning.cpp - ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CategoricalTuner - SOURCES - tools/TestCategoricalTuner.cpp - ) - endif() - if((NOT Kokkos_ENABLE_OPENMPTARGET) AND (NOT Kokkos_ENABLE_OPENACC)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_LogicalSpaces - SOURCES - tools/TestLogicalSpaces.cpp - ) - endif() - SET(KOKKOSP_SOURCES - UnitTestMainInit.cpp - tools/TestEventCorrectness.cpp - tools/TestWithoutInitializing.cpp - tools/TestProfilingSection.cpp - tools/TestScopedRegion.cpp - ) +# FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 +# when compiling for Intel's Xe-HP GPUs. +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) + list(REMOVE_ITEM KOKKOSP_SOURCES tools/TestEventCorrectness.cpp) +endif() - # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 - # when compiling for Intel's Xe-HP GPUs. 
- if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) - list(REMOVE_ITEM KOKKOSP_SOURCES tools/TestEventCorrectness.cpp) +kokkos_add_executable_and_test(CoreUnitTest_KokkosP SOURCES ${KOKKOSP_SOURCES}) +if(KOKKOS_ENABLE_LIBDL) + kokkos_add_executable_and_test(CoreUnitTest_ToolIndependence SOURCES tools/TestIndependence.cpp) + target_compile_definitions(Kokkos_CoreUnitTest_ToolIndependence PUBLIC KOKKOS_TOOLS_INDEPENDENT_BUILD) + kokkos_add_test_library(kokkosprinter-tool SHARED SOURCES tools/printing-tool.cpp) + + if((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + target_compile_features(kokkosprinter-tool PUBLIC cxx_std_14) endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_KokkosP - SOURCES - ${KOKKOSP_SOURCES} + kokkos_add_test_executable(ProfilingAllCalls tools/TestAllCalls.cpp) + + kokkos_add_test_executable(ToolsInitialization UnitTestMain.cpp tools/TestToolsInitialization.cpp) + + set(ADDRESS_REGEX "0x[0-9a-f]*") + set(MEMSPACE_REGEX "[HC][ou][sd][ta][a-zA-Z]*") + set(SIZE_REGEX "[0-9]*") + set(SKIP_SCRATCH_INITIALIZATION_REGEX ".*") + + # check help works via environment variable + kokkos_add_test( + SKIP_TRIBITS + NAME + ProfilingTestLibraryLoadHelp + EXE + ProfilingAllCalls + TOOL + kokkosprinter-tool + ARGS + --kokkos-tools-help + PASS_REGULAR_EXPRESSION + "kokkosp_init_library::kokkosp_print_help:Kokkos_ProfilingAllCalls::kokkosp_finalize_library::" ) - if(KOKKOS_ENABLE_LIBDL) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_ToolIndependence - SOURCES - tools/TestIndependence.cpp - ) - TARGET_COMPILE_DEFINITIONS( - Kokkos_CoreUnitTest_ToolIndependence PUBLIC - KOKKOS_TOOLS_INDEPENDENT_BUILD - ) - KOKKOS_ADD_TEST_LIBRARY( - kokkosprinter-tool SHARED - SOURCES tools/printing-tool.cpp - ) - - if((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) - TARGET_COMPILE_FEATURES(kokkosprinter-tool PUBLIC cxx_std_14) - endif() - 
KOKKOS_ADD_TEST_EXECUTABLE( - ProfilingAllCalls - tools/TestAllCalls.cpp - ) - - KOKKOS_ADD_TEST_EXECUTABLE( - ToolsInitialization - UnitTestMain.cpp - tools/TestToolsInitialization.cpp - ) + # check help works via direct library specification + kokkos_add_test( + SKIP_TRIBITS + NAME + ProfilingTestLibraryCmdLineHelp + EXE + ProfilingAllCalls + ARGS + --kokkos-tools-help + --kokkos-tools-libs=$<TARGET_FILE:kokkosprinter-tool> + PASS_REGULAR_EXPRESSION + "kokkosp_init_library::kokkosp_print_help:Kokkos_ProfilingAllCalls::kokkosp_finalize_library::" + ) - set(ADDRESS_REGEX "0x[0-9a-f]*") - set(MEMSPACE_REGEX "[HC][ou][sd][ta][a-zA-Z]*") - set(SIZE_REGEX "[0-9]*") - set(SKIP_SCRATCH_INITIALIZATION_REGEX ".*") - - # check help works via environment variable - KOKKOS_ADD_TEST( - SKIP_TRIBITS - NAME ProfilingTestLibraryLoadHelp - EXE ProfilingAllCalls - TOOL kokkosprinter-tool - ARGS --kokkos-tools-help - PASS_REGULAR_EXPRESSION - "kokkosp_init_library::kokkosp_print_help:Kokkos_ProfilingAllCalls::kokkosp_finalize_library::") - - # check help works via direct library specification - KOKKOS_ADD_TEST( - SKIP_TRIBITS - NAME ProfilingTestLibraryCmdLineHelp - EXE ProfilingAllCalls - ARGS --kokkos-tools-help - --kokkos-tools-libs=$<TARGET_FILE:kokkosprinter-tool> - PASS_REGULAR_EXPRESSION - "kokkosp_init_library::kokkosp_print_help:Kokkos_ProfilingAllCalls::kokkosp_finalize_library::") - - KOKKOS_ADD_TEST( - SKIP_TRIBITS - NAME ProfilingTestLibraryLoad - EXE ProfilingAllCalls - TOOL kokkosprinter-tool - ARGS --kokkos-tools-args="-c test delimit" - PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:Kokkos_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization 
[[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" - ) + kokkos_add_test( + SKIP_TRIBITS + NAME + ProfilingTestLibraryLoad + EXE + ProfilingAllCalls + TOOL + kokkosprinter-tool + ARGS + --kokkos-tools-args="-c test delimit" + PASS_REGULAR_EXPRESSION + "kokkosp_init_library::kokkosp_parse_args:4:Kokkos_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via 
memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" + ) - # Above will test that leading/trailing quotes are stripped bc ctest cmd args is: - # "--kokkos-tools-args="-c test delimit"" - # The bracket argument syntax: [=[ and ]=] used below ensures it is treated as - # a single argument: - # "--kokkos-tools-args=-c test delimit" - # - # https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument - # - KOKKOS_ADD_TEST( - SKIP_TRIBITS - NAME ProfilingTestLibraryCmdLine - EXE ProfilingAllCalls - ARGS [=[--kokkos-tools-args=-c test delimit]=] - --kokkos-tools-libs=$<TARGET_FILE:kokkosprinter-tool> - PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:Kokkos_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via 
memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" - ) - endif() #KOKKOS_ENABLE_LIBDL -if(NOT KOKKOS_HAS_TRILINOS) -KOKKOS_ADD_TEST_EXECUTABLE( + # Above will test that leading/trailing quotes are stripped bc ctest cmd args is: + # "--kokkos-tools-args="-c test delimit"" + # The bracket argument syntax: [=[ and ]=] used below ensures it is treated as + # a single argument: + # "--kokkos-tools-args=-c test delimit" + # + # https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument + # + kokkos_add_test( + SKIP_TRIBITS + NAME + ProfilingTestLibraryCmdLine + EXE + ProfilingAllCalls + ARGS + [=[--kokkos-tools-args=-c test delimit]=] + --kokkos-tools-libs=$<TARGET_FILE:kokkosprinter-tool> + PASS_REGULAR_EXPRESSION + "kokkosp_init_library::kokkosp_parse_args:4:Kokkos_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via 
memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::.*kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" + ) +endif() #KOKKOS_ENABLE_LIBDL +kokkos_add_test_executable( StackTraceTestExec SOURCES - TestStackTrace.cpp - TestStackTrace_f0.cpp - TestStackTrace_f1.cpp - TestStackTrace_f2.cpp - TestStackTrace_f3.cpp - TestStackTrace_f4.cpp + TestStackTrace.cpp + TestStackTrace_f0.cpp + TestStackTrace_f1.cpp + TestStackTrace_f2.cpp + TestStackTrace_f3.cpp + TestStackTrace_f4.cpp ) # We need -rdynamic on GNU platforms for the stacktrace functionality # to work correctly with shared libraries -KOKKOS_SET_EXE_PROPERTY(StackTraceTestExec ENABLE_EXPORTS ON) +kokkos_set_exe_property(StackTraceTestExec ENABLE_EXPORTS ON) -KOKKOS_ADD_TEST( NAME CoreUnitTest_StackTraceTest - EXE StackTraceTestExec - FAIL_REGULAR_EXPRESSION "FAILED" - ) -endif() +kokkos_add_test(NAME CoreUnitTest_StackTraceTest EXE StackTraceTestExec FAIL_REGULAR_EXPRESSION "FAILED") 
-if(Kokkos_ENABLE_DEPRECATED_CODE_3) - foreach(INITTESTS_NUM RANGE 1 18) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_DefaultInit_${INITTESTS_NUM} - SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp - ) - endforeach(INITTESTS_NUM) +if(KOKKOS_ENABLE_HWLOC) + kokkos_add_executable_and_test(CoreUnitTest_HWLOC SOURCES UnitTestMain.cpp TestHWLOC.cpp) endif() -if (KOKKOS_ENABLE_HWLOC) -KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_HWLOC - SOURCES UnitTestMain.cpp TestHWLOC.cpp -) -endif() - -FUNCTION (KOKKOS_ADD_INCREMENTAL_TEST DEVICE) - KOKKOS_OPTION( ${DEVICE}_EXCLUDE_TESTS "" STRING "Incremental test exclude list" ) +function(KOKKOS_ADD_INCREMENTAL_TEST DEVICE) + kokkos_option(${DEVICE}_EXCLUDE_TESTS "" STRING "Incremental test exclude list") # Add unit test main - SET(${DEVICE}_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/UnitTestMainInit.cpp) + set(${DEVICE}_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/UnitTestMainInit.cpp) # Iterate over incremental tests in directory - APPEND_GLOB(INCREMENTAL_FILE_LIST ${CMAKE_CURRENT_SOURCE_DIR}/incremental/*.hpp) - - SET(DEVICE_NAME ${KOKKOS_${DEVICE}_NAME}) - FOREACH (CURRENT_FILE_PATH ${INCREMENTAL_FILE_LIST}) - GET_FILENAME_COMPONENT( CURRENT_FILE_NAME ${CURRENT_FILE_PATH} NAME ) - STRING (REPLACE ".hpp" "" CURRENT_TEST_NAME ${CURRENT_FILE_NAME}) - IF (NOT CURRENT_TEST_NAME IN_LIST Kokkos_${DEVICE}_EXCLUDE_TESTS) - SET (CURRENT_TEST_OUTPUT_FILENAME ${CURRENT_TEST_NAME}_${DEVICE}) - FILE( STRINGS ${CURRENT_FILE_PATH} CURRENT_REQUIRED_FEATURE_LINE REGEX "Kokkos_Feature_Level_Required" ) - # From each test get level implementation required - STRING( REGEX REPLACE ".*Kokkos_Feature_Level_Required:" "" CURRENT_REQUIRED_FEATURE_LEVEL ${CURRENT_REQUIRED_FEATURE_LINE} ) - # Cross-reference list of dependencies with selected feature list > matching feature test files are added to test applications - IF (KOKKOS_${DEVICE}_FEATURE_LEVEL GREATER_EQUAL CURRENT_REQUIRED_FEATURE_LEVEL) - CONFIGURE_FILE 
(IncrementalTest.cpp.in ${CMAKE_BINARY_DIR}/core/unit_test/generated/${CURRENT_TEST_OUTPUT_FILENAME}.cpp ) - SET(${DEVICE}_SOURCES ${${DEVICE}_SOURCES}; ${CMAKE_BINARY_DIR}/core/unit_test/generated/${CURRENT_TEST_OUTPUT_FILENAME}.cpp) - ENDIF() - ENDIF() - ENDFOREACH() - - STRING(TOUPPER ${DEVICE} UC_DEVICE) - - KOKKOS_OPTION ( - ENABLE_${UC_DEVICE} ON BOOL "ENABLE ${UC_DEVICE}" - ) + append_glob(INCREMENTAL_FILE_LIST ${CMAKE_CURRENT_SOURCE_DIR}/incremental/*.hpp) + + set(DEVICE_NAME ${KOKKOS_${DEVICE}_NAME}) + foreach(CURRENT_FILE_PATH ${INCREMENTAL_FILE_LIST}) + get_filename_component(CURRENT_FILE_NAME ${CURRENT_FILE_PATH} NAME) + string(REPLACE ".hpp" "" CURRENT_TEST_NAME ${CURRENT_FILE_NAME}) + if(NOT CURRENT_TEST_NAME IN_LIST Kokkos_${DEVICE}_EXCLUDE_TESTS) + set(CURRENT_TEST_OUTPUT_FILENAME ${CURRENT_TEST_NAME}_${DEVICE}) + file(STRINGS ${CURRENT_FILE_PATH} CURRENT_REQUIRED_FEATURE_LINE REGEX "Kokkos_Feature_Level_Required") + # From each test get level implementation required + string(REGEX REPLACE ".*Kokkos_Feature_Level_Required:" "" CURRENT_REQUIRED_FEATURE_LEVEL + ${CURRENT_REQUIRED_FEATURE_LINE} + ) + # Cross-reference list of dependencies with selected feature list > matching feature test files are added to test applications + if(KOKKOS_${DEVICE}_FEATURE_LEVEL GREATER_EQUAL CURRENT_REQUIRED_FEATURE_LEVEL) + configure_file( + IncrementalTest.cpp.in ${CMAKE_BINARY_DIR}/core/unit_test/generated/${CURRENT_TEST_OUTPUT_FILENAME}.cpp + ) + set(${DEVICE}_SOURCES ${${DEVICE}_SOURCES}; + ${CMAKE_BINARY_DIR}/core/unit_test/generated/${CURRENT_TEST_OUTPUT_FILENAME}.cpp + ) + endif() + endif() + endforeach() - KOKKOS_ADD_EXECUTABLE_AND_TEST( - IncrementalTest_${DEVICE} - SOURCES ${${DEVICE}_SOURCES} - ) + string(TOUPPER ${DEVICE} UC_DEVICE) + + kokkos_option(ENABLE_${UC_DEVICE} ON BOOL "ENABLE ${UC_DEVICE}") + + kokkos_add_executable_and_test(IncrementalTest_${DEVICE} SOURCES ${${DEVICE}_SOURCES}) - SET(EXE_NAME ${PACKAGE_NAME}_IncrementalTest_${DEVICE}) + 
set(EXE_NAME ${PACKAGE_NAME}_IncrementalTest_${DEVICE}) # Check that the target was actually created because in a TribITS build # where only tests marked as PERFORMANCE enabled it would not be. - IF(TARGET ${EXE_NAME}) - TARGET_INCLUDE_DIRECTORIES(${EXE_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/incremental ) - ENDIF() + if(TARGET ${EXE_NAME}) + target_include_directories(${EXE_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/incremental) + endif() -ENDFUNCTION() +endfunction() -FOREACH (DEVICE ${KOKKOS_ENABLED_DEVICES}) - KOKKOS_ADD_INCREMENTAL_TEST(${DEVICE}) -ENDFOREACH() +foreach(DEVICE ${KOKKOS_ENABLED_DEVICES}) + kokkos_add_incremental_test(${DEVICE}) +endforeach() -KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CTestDevice - SOURCES UnitTestMain.cpp TestCTestDevice.cpp -) +kokkos_add_executable_and_test(CoreUnitTest_CTestDevice SOURCES UnitTestMain.cpp TestCTestDevice.cpp) -KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_CMakePassCmdLineArgs - SOURCES UnitTest_CMakePassCmdLineArgs.cpp - ARGS "one 2 THREE" +kokkos_add_executable_and_test( + CoreUnitTest_CMakePassCmdLineArgs SOURCES UnitTest_CMakePassCmdLineArgs.cpp ARGS "one 2 THREE" ) -# This test is not properly set up to run within Trilinos -if (NOT KOKKOS_HAS_TRILINOS) - SET_SOURCE_FILES_PROPERTIES(UnitTest_DeviceAndThreads.cpp PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - add_executable(Kokkos_CoreUnitTest_DeviceAndThreads UnitTest_DeviceAndThreads.cpp) - target_link_libraries(Kokkos_CoreUnitTest_DeviceAndThreads Kokkos::kokkoscore) - find_package(Python3 COMPONENTS Interpreter) - if(Python3_Interpreter_FOUND AND Python3_VERSION VERSION_GREATER_EQUAL 3.7) - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20) - set(USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED USE_SOURCE_PERMISSIONS) - endif() - file(GENERATE - OUTPUT $<TARGET_FILE_DIR:Kokkos_CoreUnitTest_DeviceAndThreads>/TestDeviceAndThreads.py - INPUT TestDeviceAndThreads.py - ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED} - ) - if(NOT Kokkos_ENABLE_OPENMPTARGET) # 
FIXME_OPENMPTARGET does not select the right device - add_test( - NAME Kokkos_CoreUnitTest_DeviceAndThreads - COMMAND ${Python3_EXECUTABLE} -m unittest -v $<TARGET_FILE_DIR:Kokkos_CoreUnitTest_DeviceAndThreads>/TestDeviceAndThreads.py - ) - endif() +kokkos_add_executable(CoreUnitTest_CMakeTriBITSCompatibility SOURCES UnitTest_CMakeTriBITSCompatibility.cpp) + +kokkos_add_test(NAME CoreUnitTest_CMakeTriBITSCompatibilityWillFail EXE CoreUnitTest_CMakeTriBITSCompatibility) +set_property(TEST Kokkos_CoreUnitTest_CMakeTriBITSCompatibilityWillFail PROPERTY WILL_FAIL TRUE) + +set(Kokkos_CoreUnitTest_CMakeTriBITSCompatibilityDisable_DISABLE ON) +kokkos_add_test(NAME CoreUnitTest_CMakeTriBITSCompatibilityDisable EXE CoreUnitTest_CMakeTriBITSCompatibility) + +set(Kokkos_CoreUnitTest_CMakeTriBITSCompatibilityExtraArgs_EXTRA_ARGS "--kokkos-test-tribits-compatibility=1") +kokkos_add_test(NAME CoreUnitTest_CMakeTriBITSCompatibilityExtraArgs EXE CoreUnitTest_CMakeTriBITSCompatibility) + +set(Kokkos_CoreUnitTest_CMakeTriBITSCompatibilityEnvironment_ENVIRONMENT "KOKKOS_TEST_TRIBITS_COMPATIBILITY=1") +kokkos_add_test(NAME CoreUnitTest_CMakeTriBITSCompatibilityEnvironment EXE CoreUnitTest_CMakeTriBITSCompatibility) + +set_source_files_properties(UnitTest_DeviceAndThreads.cpp PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) +add_executable(Kokkos_CoreUnitTest_DeviceAndThreads UnitTest_DeviceAndThreads.cpp) +target_link_libraries(Kokkos_CoreUnitTest_DeviceAndThreads Kokkos::kokkoscore) +find_package(Python3 COMPONENTS Interpreter) +if(Python3_Interpreter_FOUND AND Python3_VERSION VERSION_GREATER_EQUAL 3.7) + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20) + set(USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED USE_SOURCE_PERMISSIONS) endif() + file( + GENERATE + OUTPUT $<TARGET_FILE_DIR:Kokkos_CoreUnitTest_DeviceAndThreads>/TestDeviceAndThreads.py + INPUT TestDeviceAndThreads.py + ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED} + ) + add_test(NAME Kokkos_CoreUnitTest_DeviceAndThreads + COMMAND 
${Python3_EXECUTABLE} + $<TARGET_FILE_DIR:Kokkos_CoreUnitTest_DeviceAndThreads>/TestDeviceAndThreads.py -v + ) endif() -if (KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS AND NOT KOKKOS_HAS_TRILINOS AND NOT WIN32) +if(KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS AND NOT WIN32) add_subdirectory(headers_self_contained) endif() diff --git a/packages/kokkos/core/unit_test/IncrementalTest.cpp.in b/packages/kokkos/core/unit_test/IncrementalTest.cpp.in index b12182e988e3cf58c73ffba69564101ac8d41b03..2be2d52aeaf1d46ac701021a84b508f32d7880e5 100644 --- a/packages/kokkos/core/unit_test/IncrementalTest.cpp.in +++ b/packages/kokkos/core/unit_test/IncrementalTest.cpp.in @@ -23,9 +23,7 @@ #define TEST_CATEGORY @DEVICE@ #define TEST_EXECSPACE Kokkos::@DEVICE_NAME@ -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #include <@CURRENT_FILE_NAME@> -#endif #endif diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile index 33a84b61f92a82fc4e668fd640149afa113e06d5..c4f798d9f71e926dae4374c058fe15b562356cfb 100644 --- a/packages/kokkos/core/unit_test/Makefile +++ b/packages/kokkos/core/unit_test/Makefile @@ -62,18 +62,18 @@ else STACK_TRACE_TERMINATE_FILTER := endif -TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b 
ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize +TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other ParallelScanRangePolicy RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reducers_e Reductions_DeviceView SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewCopy_c ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include <Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include <Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include <Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ ) \ ) \ ) -GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewMapping_a ViewMapping_b ViewMapping_subview +GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewCopy_c ViewMapping_a ViewMapping_b ViewMapping_subview SUBVIEW_TESTS = SubView_a SubView_b SubView_c01 SubView_c02 SubView_c03 SubView_c04 SubView_c05 SubView_c06 SubView_c07 SubView_c08 SubView_c09 SubView_c10 SubView_c11 SubView_c12 SubView_c13 @@ -82,8 +82,8 @@ 
KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST)) tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),, \ - $(shell echo "\#include <Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include <Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include <Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ ) \ )\ ) @@ -91,8 +91,8 @@ tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter TestCuda_$(test).cpp, $(shell ls TestCuda_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include <TestCudaUVM_Category.hpp>" > TestCuda_$(test).cpp); \ - $(shell echo "\#include <Test"$(test)".hpp>" >> TestCuda_$(test).cpp); \ + $(shell echo "$(H)include <TestCudaUVM_Category.hpp>" > TestCuda_$(test).cpp); \ + $(shell echo "$(H)include <Test"$(test)".hpp>" >> TestCuda_$(test).cpp); \ )\ ) @@ -100,8 +100,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \ - $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \ )\ )\ ) @@ -110,14 +110,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA += TestCuda_Init.o OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o OBJ_CUDA += TestCuda_RangePolicy.o 
TestCuda_RangePolicyRequire.o - OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o + OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o TestCuda_ViewCopy_c.o OBJ_CUDA += TestCuda_DeepCopyAlignment.o OBJ_CUDA += TestCuda_ViewMapping_a.o TestCuda_ViewMapping_b.o TestCuda_ViewMapping_subview.o TestCuda_ViewResize.o TestCuda_ViewLayoutStrideAssignment.o OBJ_CUDA += TestCudaUVM_ViewAPI_a.o TestCudaUVM_ViewAPI_b.o TestCudaUVM_ViewAPI_c.o TestCudaUVM_ViewAPI_d.o TestCudaUVM_ViewAPI_e.o - OBJ_CUDA += TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o + OBJ_CUDA += TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o TestCudaUVM_ViewCopy_c.o OBJ_CUDA += TestCudaUVM_ViewMapping_a.o TestCudaUVM_ViewMapping_b.o TestCudaUVM_ViewMapping_subview.o OBJ_CUDA += TestCudaHostPinned_ViewAPI_a.o TestCudaHostPinned_ViewAPI_b.o TestCudaHostPinned_ViewAPI_c.o TestCudaHostPinned_ViewAPI_d.o TestCudaHostPinned_ViewAPI_e.o - OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o + OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o TestCudaHostPinned_ViewCopy_c.o OBJ_CUDA += TestCudaHostPinned_ViewMapping_a.o TestCudaHostPinned_ViewMapping_b.o TestCudaHostPinned_ViewMapping_subview.o OBJ_CUDA += TestCuda_View_64bit.o OBJ_CUDA += TestCuda_ViewOfClass.o @@ -141,7 +141,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA += TestCuda_Other.o OBJ_CUDA += TestCuda_MDRange_a.o TestCuda_MDRange_b.o TestCuda_MDRange_c.o TestCuda_MDRange_d.o TestCuda_MDRange_e.o OBJ_CUDA += TestCuda_Crs.o - OBJ_CUDA += TestCuda_Task.o TestCuda_WorkGraph.o + ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + OBJ_CUDA += TestCuda_Task.o + endif + OBJ_CUDA += TestCuda_WorkGraph.o OBJ_CUDA += TestCuda_Spaces.o OBJ_CUDA += TestCuda_UniqueToken.o OBJ_CUDA += 
TestCuda_LocalDeepCopy.o @@ -162,7 +165,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) OBJ_THREADS += TestThreads_RangePolicy.o TestThreads_RangePolicyRequire.o OBJ_THREADS += TestThreads_View_64bit.o OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o TestThreads_ViewAPI_c.o TestThreads_ViewAPI_d.o TestThreads_ViewAPI_e.o - OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o + OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o TestThreads_ViewCopy_c.o OBJ_THREADS += TestThreads_DeepCopyAlignment.o OBJ_THREADS += TestThreads_ViewMapping_a.o TestThreads_ViewMapping_b.o TestThreads_ViewMapping_subview.o TestThreads_ViewResize.o TestThreads_ViewLayoutStrideAssignment.o OBJ_THREADS += TestThreads_ViewOfClass.o @@ -198,7 +201,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) OBJ_OPENMP += TestOpenMP_RangePolicy.o TestOpenMP_RangePolicyRequire.o OBJ_OPENMP += TestOpenMP_View_64bit.o OBJ_OPENMP += TestOpenMP_ViewAPI_a.o TestOpenMP_ViewAPI_b.o TestOpenMP_ViewAPI_c.o TestOpenMP_ViewAPI_d.o TestOpenMP_ViewAPI_e.o - OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o + OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o TestOpenMP_ViewCopy_c.o OBJ_OPENMP += TestOpenMP_ViewMapping_a.o TestOpenMP_ViewMapping_b.o TestOpenMP_ViewMapping_subview.o TestOpenMP_ViewResize.o TestOpenMP_ViewLayoutStrideAssignment.o OBJ_OPENMP += TestOpenMP_ViewOfClass.o OBJ_OPENMP += TestOpenMP_SubView_a.o TestOpenMP_SubView_b.o @@ -221,7 +224,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) OBJ_OPENMP += TestOpenMP_Other.o OBJ_OPENMP += TestOpenMP_MDRange_a.o TestOpenMP_MDRange_b.o TestOpenMP_MDRange_c.o TestOpenMP_MDRange_d.o TestOpenMP_MDRange_e.o OBJ_OPENMP += TestOpenMP_Crs.o - OBJ_OPENMP += TestOpenMP_Task.o TestOpenMP_WorkGraph.o + ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + OBJ_OPENMP += TestOpenMP_Task.o + endif + OBJ_OPENMP += TestOpenMP_WorkGraph.o OBJ_OPENMP += 
TestOpenMP_UniqueToken.o OBJ_OPENMP += TestOpenMP_LocalDeepCopy.o @@ -237,7 +243,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) #OBJ_OPENMPTARGET += TestOpenMPTarget_SharedAlloc.o OBJ_OPENMPTARGET += TestOpenMPTarget_RangePolicy.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_a.o TestOpenMPTarget_ViewAPI_b.o TestOpenMPTarget_ViewAPI_c.o TestOpenMPTarget_ViewAPI_d.o #Some commented out code - #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o TestOpenMPTarget_ViewCopy_c.o OBJ_OPENMPTARGET += TestOpenMPTarget_DeepCopyAlignment.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_a.o OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_b.o @@ -265,7 +271,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) #OBJ_OPENMPTARGET += TestOpenMPTarget_TeamReductionScan.o #OBJ_OPENMPTARGET += TestOpenMPTarget_Other.o #OBJ_OPENMPTARGET += TestOpenMPTarget_MDRange_a.o TestOpenMPTarget_MDRange_b.o TestOpenMPTarget_MDRange_c.o TestOpenMPTarget_MDRange_d.o TestOpenMPTarget_MDRange_d.e - #OBJ_OPENMPTARGET += TestOpenMPTarget_Task.o TARGETS += KokkosCore_UnitTest_OpenMPTarget @@ -277,8 +282,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \ - $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \ )\ )\ ) @@ -292,7 +297,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) OBJ_HIP += TestHIP_Memory_Requirements.o OBJ_HIP += TestHIP_ParallelScanRangePolicy.o OBJ_HIP += 
TestHIPHostPinned_ViewAPI_a.o TestHIPHostPinned_ViewAPI_b.o TestHIPHostPinned_ViewAPI_c.o TestHIPHostPinned_ViewAPI_d.o TestHIPHostPinned_ViewAPI_e.o - OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o + OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o TestHIPHostPinned_ViewCopy_c.o OBJ_HIP += TestHIPHostPinned_ViewMapping_a.o TestHIPHostPinned_ViewMapping_b.o TestHIPHostPinned_ViewMapping_subview.o TARGETS += KokkosCore_UnitTest_HIP @@ -307,7 +312,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) OBJ_HPX += TestHPX_RangePolicy.o TestHPX_RangePolicyRequire.o OBJ_HPX += TestHPX_View_64bit.o OBJ_HPX += TestHPX_ViewAPI_a.o TestHPX_ViewAPI_b.o TestHPX_ViewAPI_c.o TestHPX_ViewAPI_d.o TestHPX_ViewAPI_e.o - OBJ_HPX += TestHPX_ViewCopy_a.o TestHPX_ViewCopy_b.o + OBJ_HPX += TestHPX_ViewCopy_a.o TestHPX_ViewCopy_b.o TestHPX_ViewCopy_c.o OBJ_HPX += TestHPX_ViewMapping_a.o TestHPX_ViewMapping_b.o TestHPX_ViewMapping_subview.o TestHPX_ViewResize.o OBJ_HPX += TestHPX_ViewOfClass.o OBJ_HPX += TestHPX_SubView_a.o TestHPX_SubView_b.o @@ -330,7 +335,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) OBJ_HPX += TestHPX_Other.o OBJ_HPX += TestHPX_MDRange_a.o TestHPX_MDRange_b.o TestHPX_MDRange_c.o TestHPX_MDRange_d.o TestHPX_MDRange_e.o OBJ_HPX += TestHPX_Crs.o - OBJ_HPX += TestHPX_Task.o + ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + OBJ_HPX += TestHPX_Task.o + endif OBJ_HPX += TestHPX_WorkGraph.o OBJ_HPX += TestHPX_UniqueToken.o @@ -347,7 +354,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) OBJ_SERIAL += TestSerial_RangePolicy.o TestSerial_RangePolicyRequire.o OBJ_SERIAL += TestSerial_View_64bit.o OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o TestSerial_ViewAPI_c.o TestSerial_ViewAPI_d.o TestSerial_ViewAPI_e.o - OBJ_SERIAL += TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o + OBJ_SERIAL += TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o TestSerial_ViewCopy_c.o OBJ_SERIAL 
+= TestSerial_ViewMapping_a.o TestSerial_ViewMapping_b.o TestSerial_ViewMapping_subview.o TestSerial_ViewResize.o TestSerial_ViewLayoutStrideAssignment.o OBJ_SERIAL += TestSerial_ViewOfClass.o OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o @@ -373,7 +380,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) OBJ_SERIAL += TestSerial_MDRange_a.o TestSerial_MDRange_b.o TestSerial_MDRange_c.o TestSerial_MDRange_d.o TestSerial_MDRange_e.o endif OBJ_SERIAL += TestSerial_Crs.o - OBJ_SERIAL += TestSerial_Task.o TestSerial_WorkGraph.o + ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + OBJ_SERIAL += TestSerial_Task.o + endif + OBJ_SERIAL += TestSerial_WorkGraph.o OBJ_SERIAL += TestSerial_LocalDeepCopy.o TARGETS += KokkosCore_UnitTest_Serial @@ -392,7 +402,6 @@ ifneq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1) OBJ_DEFAULT += TestDefaultDeviceType_a1.o TestDefaultDeviceType_b1.o TestDefaultDeviceType_c1.o OBJ_DEFAULT += TestDefaultDeviceType_a2.o TestDefaultDeviceType_b2.o TestDefaultDeviceType_c2.o OBJ_DEFAULT += TestDefaultDeviceType_a3.o TestDefaultDeviceType_b3.o TestDefaultDeviceType_c3.o - OBJ_DEFAULT += TestDefaultDeviceType_d.o endif endif diff --git a/packages/kokkos/core/unit_test/TestAbort.hpp b/packages/kokkos/core/unit_test/TestAbort.hpp index 6e51ef6ee715c5a96c1533386d0f162251bd0aef..6fa4a8d4f80fc7a0309fe0900275257b6a085773 100644 --- a/packages/kokkos/core/unit_test/TestAbort.hpp +++ b/packages/kokkos/core/unit_test/TestAbort.hpp @@ -87,7 +87,7 @@ void test_abort_from_device() { TestAbortCausingAbnormalProgramTerminationAndPrinting<ExecutionSpace>(); } #elif defined(KOKKOS_ENABLE_SYCL) // FIXME_SYCL - if (std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value) { + if (std::is_same<ExecutionSpace, Kokkos::SYCL>::value) { #ifdef NDEBUG TestAbortPrintingToStdout<ExecutionSpace>(); #else @@ -103,5 +103,14 @@ void test_abort_from_device() { TEST(TEST_CATEGORY_DEATH, abort_from_device) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; +// 
FIXME_OPENACC FIXME_NVHPC: NVHPC fails when targetting CPUs. +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) && \ + defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenACC>) { + GTEST_SKIP() + << "skipping since the OpenACC backend compiled by NVHPC for CPU " + "crashes at runtime."; + } +#endif test_abort_from_device<TEST_EXECSPACE>(); } diff --git a/packages/kokkos/core/unit_test/TestAggregate.hpp b/packages/kokkos/core/unit_test/TestAggregate.hpp deleted file mode 100644 index 4f67b2eddcebe9c1fbb874e48923fd03995a787b..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/TestAggregate.hpp +++ /dev/null @@ -1,112 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef TEST_AGGREGATE_HPP -#define TEST_AGGREGATE_HPP - -#include <Kokkos_Core.hpp> - -namespace Test { - -template <class DeviceType> -void TestViewAggregate() { - using value_type = Kokkos::Array<double, 32>; - using analysis_1d = - Kokkos::Impl::ViewDataAnalysis<value_type *, Kokkos::LayoutLeft, - value_type>; - - static_assert( - std::is_same<typename analysis_1d::specialize, Kokkos::Array<> >::value, - ""); - - using a32_traits = Kokkos::ViewTraits<value_type **, DeviceType>; - using flat_traits = - Kokkos::ViewTraits<typename a32_traits::scalar_array_type, DeviceType>; - - static_assert( - std::is_same<typename a32_traits::specialize, Kokkos::Array<> >::value, - ""); - static_assert( - std::is_same<typename a32_traits::value_type, value_type>::value, ""); - static_assert(a32_traits::rank == 2, ""); - static_assert(a32_traits::rank_dynamic == 2, ""); - - static_assert(std::is_void<typename flat_traits::specialize>::value, ""); - static_assert(flat_traits::rank == 3, ""); - static_assert(flat_traits::rank_dynamic == 2, ""); - static_assert(flat_traits::dimension::N2 == 32, ""); - - using a32_type = Kokkos::View<Kokkos::Array<double, 32> **, DeviceType>; - using a32_flat_type = typename a32_type::array_type; - - static_assert(std::is_same<typename a32_type::value_type, value_type>::value, - ""); - static_assert(std::is_same<typename a32_type::pointer_type, double *>::value, - ""); - static_assert(a32_type::rank == 2, ""); - static_assert(a32_flat_type::rank == 3, ""); - - a32_type x("test", 4, 5); - a32_flat_type y(x); - - ASSERT_EQ(x.extent(0), 4u); - ASSERT_EQ(x.extent(1), 5u); - ASSERT_EQ(y.extent(0), 4u); - ASSERT_EQ(y.extent(1), 5u); - ASSERT_EQ(y.extent(2), 32u); - - // Initialize arrays from brace-init-list as for std::array. 
- // - // Comment: Clang will issue the following warning if we don't use double - // braces here (one for initializing the Kokkos::Array and one for - // initializing the sub-aggreagate C-array data member), - // - // warning: suggest braces around initialization of subobject - // - // but single brace syntax would be valid as well. - Kokkos::Array<float, 2> aggregate_initialization_syntax_1 = {{1.41, 3.14}}; - ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[0], 1.41); - ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[1], 3.14); - - Kokkos::Array<int, 3> aggregate_initialization_syntax_2{ - {0, 1, 2}}; // since C++11 - for (int i = 0; i < 3; ++i) { - ASSERT_EQ(aggregate_initialization_syntax_2[i], i); - } - - // Note that this is a valid initialization. - Kokkos::Array<double, 3> initialized_with_one_argument_missing = {{255, 255}}; - for (int i = 0; i < 2; ++i) { - ASSERT_DOUBLE_EQ(initialized_with_one_argument_missing[i], 255); - } - // But the following line would not compile - // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; - - // The code below must compile for zero-sized arrays. - using T = float; - - constexpr int N = 0; - Kokkos::Array<T, N> a; - for (int i = 0; i < N; ++i) { - a[i] = T(); - } -} - -TEST(TEST_CATEGORY, view_aggregate) { TestViewAggregate<TEST_EXECSPACE>(); } - -} // namespace Test - -#endif /* #ifndef TEST_AGGREGATE_HPP */ diff --git a/packages/kokkos/core/unit_test/TestArray.cpp b/packages/kokkos/core/unit_test/TestArray.cpp index d3bdc4f93f7b81c2c78d69a51ed4739ecd641935..15f4794e4ac685ccc13327e411114453bda9930d 100644 --- a/packages/kokkos/core/unit_test/TestArray.cpp +++ b/packages/kokkos/core/unit_test/TestArray.cpp @@ -15,9 +15,19 @@ //@HEADER #include <Kokkos_Array.hpp> +#include <Kokkos_DetectionIdiom.hpp> namespace { +// nvcc errors on variables only used in static_asserts +// Passing those variables to this function should eliminate the warning +template <typename... 
Ts> +KOKKOS_FUNCTION constexpr void maybe_unused(Ts&&...) {} + +template <typename T, typename U = T> +using equality_comparable = + decltype(std::declval<T const&>() == std::declval<U const&>()); + KOKKOS_FUNCTION constexpr bool test_array() { constexpr Kokkos::Array<int, 3> a{{1, 2}}; @@ -49,4 +59,283 @@ KOKKOS_FUNCTION constexpr bool test_array_structured_binding_support() { static_assert(test_array_structured_binding_support()); +// Disable ctad test for intel versions < 2021, see issue #6702 +#if !defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL >= 2021 +KOKKOS_FUNCTION constexpr bool test_array_ctad() { + constexpr int x = 10; + constexpr Kokkos::Array a{1, 2, 3, 5, x}; + constexpr Kokkos::Array<int, 5> b{1, 2, 3, 5, x}; + + return std::is_same_v<decltype(a), decltype(b)> && a == b; +} + +static_assert(test_array_ctad()); +#endif + +KOKKOS_FUNCTION constexpr bool test_array_aggregate_initialization() { + // Initialize arrays from brace-init-list as for std::array. + + Kokkos::Array<float, 2> aggregate_initialization_syntax_1 = {1.41f, 3.14f}; + if ((aggregate_initialization_syntax_1[0] != 1.41f) || + (aggregate_initialization_syntax_1[1] != 3.14f)) + return false; + + Kokkos::Array<int, 3> aggregate_initialization_syntax_2{ + {0, 1, 2}}; // since C++11 + if ((aggregate_initialization_syntax_2[0] != 0) || + (aggregate_initialization_syntax_2[1] != 1) || + (aggregate_initialization_syntax_2[2] != 2)) + return false; + + // Note that this is a valid initialization. 
+ Kokkos::Array<double, 3> initialized_with_one_argument_missing = {{255, 255}}; + if ((initialized_with_one_argument_missing[0] != 255) || + (initialized_with_one_argument_missing[1] != 255) || + (initialized_with_one_argument_missing[2] != 0)) + return false; + + // But the following line would not compile + // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; + + return true; +} + +static_assert(test_array_aggregate_initialization()); + +// A few compilers, such as GCC 8.4, were erroring out when the function below +// appeared in a constant expression because +// Kokkos::Array<T, 0, Proxy>::operator[] is non-constexpr. The issue +// disappears with GCC 9.1 (https://godbolt.org/z/TG4TEef1b). As a workaround, +// the static_assert was dropped and the [[maybe_unused]] is used as an attempt +// to silent warnings that the function is never used. +[[maybe_unused]] KOKKOS_FUNCTION void test_array_zero_sized() { + using T = float; + + // The code below must compile for zero-sized arrays. 
+ constexpr int N = 0; + Kokkos::Array<T, N> a; + for (int i = 0; i < N; ++i) { + a[i] = T(); + } +} + +constexpr bool test_array_const_qualified_element_type() { + Kokkos::Array<int const, 1> a{255}; + return a[0] == 255; +} + +static_assert(test_array_const_qualified_element_type()); + +// User-defined type providing a sepcialization of kokkos_swap +struct MyInt { + int i; + + private: + friend constexpr KOKKOS_FUNCTION void kokkos_swap(MyInt& lhs, + MyInt& rhs) noexcept { + lhs.i = 255; + rhs.i = 127; + } +}; + +constexpr bool test_array_specialization_kokkos_swap() { + Kokkos::Array<MyInt, 2> a{MyInt{1}, MyInt{2}}; + Kokkos::Array<MyInt, 2> b{MyInt{11}, MyInt{22}}; + + // sanity check + if (a[0].i != 1 || a[1].i != 2 || b[0].i != 11 || b[1].i != 22) { + return false; + } + + using Kokkos::kokkos_swap; + kokkos_swap(a, b); + + // check that the user-definied kokkos_swap(MyInt) overload was called + if (a[0].i != 255 || a[1].i != 255 || b[0].i != 127 || b[1].i != 127) { + return false; + } + + return true; +} + +static_assert(test_array_specialization_kokkos_swap()); + +constexpr bool test_to_array() { + // copies a string literal + [[maybe_unused]] auto a1 = Kokkos::to_array("foo"); + static_assert(a1.size() == 4); + maybe_unused(a1); + + // deduces both element type and length + [[maybe_unused]] auto a2 = Kokkos::to_array({0, 2, 1, 3}); + static_assert(std::is_same_v<decltype(a2), Kokkos::Array<int, 4>>); + maybe_unused(a2); + +// gcc8, icc, and nvcc 11.3 do not support the implicit conversion +#if !(defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 910)) && \ + !(defined(KOKKOS_COMPILER_INTEL) && (KOKKOS_COMPILER_INTEL < 2021)) && \ + !(defined(KOKKOS_COMPILER_NVCC) && (KOKKOS_COMPILER_NVCC < 1140)) + // deduces length with element type specified + // implicit conversion happens + [[maybe_unused]] auto a3 = Kokkos::to_array<long>({0, 1, 3}); + static_assert(std::is_same_v<decltype(a3), Kokkos::Array<long, 3>>); + maybe_unused(a3); +#endif + + return true; 
+} + +static_assert(test_to_array()); + +// making sure we cover both const and non-const cases by having a function that +// writes to an array and another one that reads from it +// also checking that it supports host device annotations +template <class T, size_t N> +KOKKOS_FUNCTION constexpr void iota(Kokkos::Array<T, N>& a, T value) { + for (auto& e : a) { + e = value++; + } +} + +template <class T, size_t N> +KOKKOS_FUNCTION constexpr T accumulate(Kokkos::Array<T, N> const& a, T init) { + T acc = init; + for (auto const& e : a) { + acc = acc + e; + } + return acc; +} + +constexpr bool test_range_based_for_loop() { + // making sure zero-sized arrays are supported + constexpr Kokkos::Array<int, 0> a0 = [] { + Kokkos::Array<int, 0> a{}; + iota(a, 1); + return a; + }(); + static_assert(accumulate(a0, 0) == 0); + + constexpr Kokkos::Array<int, 1> a1 = [] { + Kokkos::Array<int, 1> a{}; + iota(a, 1); + return a; + }(); + static_assert(accumulate(a1, 0) == 1); + + constexpr Kokkos::Array<int, 2> a2 = [] { + Kokkos::Array<int, 2> a{}; + iota(a, 1); + return a; + }(); + static_assert(accumulate(a2, 0) == 3); + + constexpr Kokkos::Array<int, 3> a3 = [] { + Kokkos::Array<int, 3> a{}; + iota(a, 1); + return a; + }(); + static_assert(accumulate(a3, 0) == 6); + + return true; +} + +static_assert(test_range_based_for_loop()); + +constexpr bool test_begin_end() { + constexpr Kokkos::Array<float, 0> a0{}; + static_assert(begin(a0) == nullptr); + static_assert(end(a0) == nullptr); + + constexpr Kokkos::Array<float, 1> a1{}; + static_assert(begin(a1) == &a1[0]); + static_assert(end(a1) == &a1[0] + a1.size()); + + [[maybe_unused]] Kokkos::Array<double, 0> n0{}; + static_assert(std::is_same_v<decltype(begin(n0)), double*>); + static_assert(std::is_same_v<decltype(end(n0)), double*>); + static_assert(std::is_same_v<double*, decltype(n0)::pointer>); + static_assert(noexcept(begin(n0))); + static_assert(noexcept(end(n0))); + + [[maybe_unused]] Kokkos::Array<double, 0> const c0{}; + 
static_assert(std::is_same_v<decltype(begin(c0)), double const*>); + static_assert(std::is_same_v<decltype(end(c0)), double const*>); + static_assert(std::is_same_v<double const*, decltype(c0)::const_pointer>); + static_assert(noexcept(begin(c0))); + static_assert(noexcept(end(c0))); + + [[maybe_unused]] Kokkos::Array<double, 1> n1{}; + static_assert(std::is_same_v<decltype(begin(n1)), double*>); + static_assert(std::is_same_v<decltype(end(n1)), double*>); + static_assert(std::is_same_v<double*, decltype(n1)::pointer>); + static_assert(noexcept(begin(n1))); + static_assert(noexcept(end(n1))); + + [[maybe_unused]] Kokkos::Array<double, 1> const c1{}; + static_assert(std::is_same_v<decltype(begin(c1)), double const*>); + static_assert(std::is_same_v<decltype(end(c1)), double const*>); + static_assert(std::is_same_v<double const*, decltype(c1)::const_pointer>); + static_assert(noexcept(begin(c1))); + static_assert(noexcept(end(c1))); + + return true; +} + +static_assert(test_begin_end()); + +constexpr bool test_array_equality_comparable() { + using C0 = Kokkos::Array<char, 0>; + using C2 = Kokkos::Array<char, 2>; + using C3 = Kokkos::Array<char, 3>; + using I0 = Kokkos::Array<int, 0>; + using I2 = Kokkos::Array<int, 2>; + using I3 = Kokkos::Array<int, 3>; + + static_assert(Kokkos::is_detected_v<equality_comparable, C0, C0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C0, C2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C0, C3>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C0, I0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C0, I2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C0, I3>); + + static_assert(!Kokkos::is_detected_v<equality_comparable, C2, C0>); + static_assert(Kokkos::is_detected_v<equality_comparable, C2, C2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C2, C3>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C2, I0>); + 
static_assert(!Kokkos::is_detected_v<equality_comparable, C2, I2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C2, I3>); + + static_assert(!Kokkos::is_detected_v<equality_comparable, C3, C0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C3, C2>); + static_assert(Kokkos::is_detected_v<equality_comparable, C3, C3>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C3, I0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C3, I2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, C3, I3>); + + static_assert(!Kokkos::is_detected_v<equality_comparable, I0, C0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I0, C2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I0, C3>); + static_assert(Kokkos::is_detected_v<equality_comparable, I0, I0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I0, I2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I0, I3>); + + static_assert(!Kokkos::is_detected_v<equality_comparable, I2, C0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I2, C2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I2, C3>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I2, I0>); + static_assert(Kokkos::is_detected_v<equality_comparable, I2, I2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I2, I3>); + + static_assert(!Kokkos::is_detected_v<equality_comparable, I3, C0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I3, C2>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I3, C3>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I3, I0>); + static_assert(!Kokkos::is_detected_v<equality_comparable, I3, I2>); + static_assert(Kokkos::is_detected_v<equality_comparable, I3, I3>); + + return true; +} + +static_assert(test_array_equality_comparable()); + } // namespace diff --git a/packages/kokkos/core/unit_test/TestArrayOps.hpp 
b/packages/kokkos/core/unit_test/TestArrayOps.hpp index 065285727147edf338592711427a2f161ca645c1..568c648e79fea034ec3942b4154839445b90d22d 100644 --- a/packages/kokkos/core/unit_test/TestArrayOps.hpp +++ b/packages/kokkos/core/unit_test/TestArrayOps.hpp @@ -92,6 +92,31 @@ TEST(TEST_CATEGORY, array_element_access) { ASSERT_EQ(ca.data()[index], a[index]); } +TEST(TEST_CATEGORY, array_operator_equal) { + using A = Kokkos::Array<int, 2>; + constexpr A a{{3, 5}}; + constexpr A b{{3, 5}}; + constexpr A c{{5, 3}}; + + static_assert(a == b); + static_assert(!(a == c)); + static_assert(a != c); + + ASSERT_TRUE(a == b); + ASSERT_FALSE(a == c); + ASSERT_TRUE(a != c); + + using E = Kokkos::Array<int, 0>; + constexpr E e; + constexpr E f; + + static_assert(e == f); + static_assert(!(e != f)); + + ASSERT_TRUE(e == f); + ASSERT_FALSE(e != f); +} + TEST(TEST_CATEGORY, array_zero_capacity) { using A = Kokkos::Array<int, 0>; A e; @@ -111,6 +136,10 @@ TEST(TEST_CATEGORY, array_zero_data_nullptr) { ASSERT_EQ(ce.data(), nullptr); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif TEST(TEST_CATEGORY, array_contiguous_capacity) { using A = Kokkos::Array<int, KOKKOS_INVALID_INDEX, Kokkos::Array<>::contiguous>; @@ -389,5 +418,10 @@ TEST(TEST_CATEGORY, array_strided_assignment) { ASSERT_EQ(e.max_size(), std::size(ee) / eStride); ASSERT_EQ(e[0], ee[0]); } +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + +#endif } // namespace diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp index a5aebed41380f7b01e250e87698faad96cf231cc..12884d3b13a42cd0db352e75cc1a21983be2a765 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp @@ -16,6 +16,7 @@ #include <Kokkos_Core.hpp> #include <Kokkos_Pair.hpp> +#include 
<iostream> namespace TestAtomicOperations { @@ -151,8 +152,7 @@ struct ModAtomicTest { template <class T> KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, T* ptr_op_fetch, T update) { - // Kokkos::atomic_mod(ptr_op, update); - (void)Kokkos::atomic_fetch_mod(ptr_op, update); + Kokkos::atomic_mod(ptr_op, update); T old_val = Kokkos::atomic_fetch_mod(ptr_fetch_op, update); T new_val = Kokkos::atomic_mod_fetch(ptr_op_fetch, update); return Kokkos::pair<T, T>(old_val, new_val); @@ -200,8 +200,7 @@ struct XorAtomicTest { template <class T> KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, T* ptr_op_fetch, T update) { - // Kokkos::atomic_xor(ptr_op, update); - (void)Kokkos::atomic_fetch_xor(ptr_op, update); + Kokkos::atomic_xor(ptr_op, update); T old_val = Kokkos::atomic_fetch_xor(ptr_fetch_op, update); T new_val = Kokkos::atomic_xor_fetch(ptr_op_fetch, update); return Kokkos::pair<T, T>(old_val, new_val); @@ -217,8 +216,7 @@ struct NandAtomicTest { template <class T> KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, T* ptr_op_fetch, T update) { - // Kokkos::atomic_nand(ptr_op, update); - (void)Kokkos::atomic_fetch_nand(ptr_op, update); + Kokkos::atomic_nand(ptr_op, update); T old_val = Kokkos::atomic_fetch_nand(ptr_fetch_op, update); T new_val = Kokkos::atomic_nand_fetch(ptr_op_fetch, update); return Kokkos::pair<T, T>(old_val, new_val); @@ -234,8 +232,7 @@ struct LShiftAtomicTest { template <class T> KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, T* ptr_op_fetch, T update) { - // Kokkos::atomic_lshift(ptr_op, update); - (void)Kokkos::atomic_fetch_lshift(ptr_op, update); + Kokkos::atomic_lshift(ptr_op, update); T old_val = Kokkos::atomic_fetch_lshift(ptr_fetch_op, update); T new_val = Kokkos::atomic_lshift_fetch(ptr_op_fetch, update); return Kokkos::pair<T, T>(old_val, new_val); @@ -251,8 +248,7 @@ struct RShiftAtomicTest { template <class T> KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* 
ptr_fetch_op, T* ptr_op_fetch, T update) { - // Kokkos::atomic_rshift(ptr_op, update); not implemented - (void)Kokkos::atomic_fetch_rshift(ptr_op, update); + Kokkos::atomic_rshift(ptr_op, update); T old_val = Kokkos::atomic_fetch_rshift(ptr_fetch_op, update); T new_val = Kokkos::atomic_rshift_fetch(ptr_op_fetch, update); return Kokkos::pair<T, T>(old_val, new_val); @@ -281,6 +277,63 @@ struct LoadStoreAtomicTest { static const char* name() { return "load/store"; } }; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif +struct DeprecatedAssignAtomicTest { + template <class T> + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T update) { + T old_val = Kokkos::atomic_load(ptr_op); + Kokkos::atomic_assign(ptr_op, update); + Kokkos::atomic_assign(ptr_op_fetch, update); + Kokkos::atomic_assign(ptr_fetch_op, update); + return Kokkos::pair<T, T>(old_val, update); + } + template <class T> + KOKKOS_FUNCTION static T op(T, T update) { + return update; + } + static const char* name() { return "load/assign"; } +}; + +struct DeprecatedIncrementAtomicTest { + template <class T> + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T) { + Kokkos::atomic_increment(ptr_op); + T old_val = Kokkos::atomic_fetch_inc(ptr_fetch_op); + T new_val = Kokkos::atomic_inc_fetch(ptr_op_fetch); + return Kokkos::pair<T, T>(old_val, new_val); + } + template <class T> + KOKKOS_FUNCTION static T op(T old, T) { + return old + 1; + } + static const char* name() { return "increment"; } +}; + +struct DeprecatedDecrementAtomicTest { + template <class T> + KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, + T* ptr_op_fetch, T) { + Kokkos::atomic_decrement(ptr_op); + T old_val = Kokkos::atomic_fetch_dec(ptr_fetch_op); + T new_val = Kokkos::atomic_dec_fetch(ptr_op_fetch); + return Kokkos::pair<T, T>(old_val, new_val); + } + template <class 
T> + KOKKOS_FUNCTION static T op(T old, T) { + return old - 1; + } + static const char* name() { return "decrement"; } +}; +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif +#endif + struct IncModAtomicTest { template <class T> KOKKOS_FUNCTION static auto atomic_op(T* ptr_op, T* ptr_fetch_op, @@ -351,19 +404,82 @@ bool atomic_op_test(T old_val, T update) { }, result); if ((result & 1) != 0) - printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name()); + std::cerr << "atomic_" << Op::name() << " failed with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; + if ((result & 2) != 0) + std::cerr << "atomic_fetch_" << Op::name() << " failed with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; + if ((result & 4) != 0) + std::cerr << "atomic_" << Op::name() << "_fetch failed with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; + if ((result & 8) != 0) + std::cerr << "atomic_fetch_" << Op::name() + << " did not return old value with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; + if ((result & 16) != 0) + std::cerr << "atomic_" << Op::name() << "_fetch" + << " did not return updated value with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; + + return result == 0; +} + +template <class T> +constexpr T relative_error_threshold = T(1.0e-15); + +template <class Op, class T, class ExecSpace> +bool atomic_op_test_rel(T old_val, T update) { + Kokkos::View<T[3], ExecSpace> op_data("op_data"); + Kokkos::deep_copy(op_data, old_val); + int result = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, 1), + KOKKOS_LAMBDA(int, int& local_result) { + auto fetch_result = + Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update); + T expected_val = Op::op(old_val, update); + Kokkos::memory_fence(); + if (expected_val == T(0)) { + if (fabs(op_data(0)) > relative_error_threshold<T>) local_result += 1; + if (fabs(op_data(1)) > relative_error_threshold<T>) local_result += 2; + if 
(fabs(op_data(2)) > relative_error_threshold<T>) local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs(fetch_result.second) > relative_error_threshold<T>) + local_result += 16; + } else { + if (fabs((op_data(0) - expected_val) / expected_val) > + relative_error_threshold<T>) + local_result += 1; + if (fabs((op_data(1) - expected_val) / expected_val) > + relative_error_threshold<T>) + local_result += 2; + if (fabs((op_data(2) - expected_val) / expected_val) > + relative_error_threshold<T>) + local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs((fetch_result.second - expected_val) / expected_val) > + relative_error_threshold<T>) + local_result += 16; + } + }, + result); + if ((result & 1) != 0) + std::cerr << "atomic_" << Op::name() << " failed with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; if ((result & 2) != 0) - printf("atomic_fetch_%s failed with type %s\n", Op::name(), - typeid(T).name()); + std::cerr << "atomic_fetch_" << Op::name() << " failed with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; if ((result & 4) != 0) - printf("atomic_%s_fetch failed with type %s\n", Op::name(), - typeid(T).name()); + std::cerr << "atomic_" << Op::name() << "_fetch failed with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; if ((result & 8) != 0) - printf("atomic_fetch_%s did not return old value with type %s\n", - Op::name(), typeid(T).name()); + std::cerr << "atomic_fetch_" << Op::name() + << " did not return old value with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; if ((result & 16) != 0) - printf("atomic_%s_fetch did not return updated value with type %s\n", - Op::name(), typeid(T).name()); + std::cerr << "atomic_" << Op::name() << "_fetch" + << " did not return updated value with type " + << Kokkos::Impl::TypeInfo<T>::name() << '\n'; return result == 0; } @@ -395,20 +511,46 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { case 9: 
return atomic_op_test<XorAtomicTest, T, ExecSpace>(old_val, update); case 10: return atomic_op_test<NandAtomicTest, T, ExecSpace>(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // FIXME_NVHPC: atomic-fetch-shift operation fails due to NVHPC OpenACC + // compiler bugs, which are reported to NVIDIA. + case 11: return true; + case 12: return true; +#else case 11: - return update_in >= 0 ? atomic_op_test<LShiftAtomicTest, T, ExecSpace>( - old_val, update) - : true; + return (std::make_signed_t<T>(update_in) >= 0 && + std::make_signed_t<T>(old_val) >= 0) + ? atomic_op_test<LShiftAtomicTest, T, ExecSpace>(old_val, + update) + : true; case 12: return update_in >= 0 ? atomic_op_test<RShiftAtomicTest, T, ExecSpace>( old_val, update) : true; +#endif case 13: - return atomic_op_test<IncAtomicTest, T, ExecSpace>(old_val, update); + return +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + atomic_op_test<DeprecatedIncrementAtomicTest, T, ExecSpace>(old_val, + update) && +#endif + atomic_op_test<IncAtomicTest, T, ExecSpace>(old_val, update); case 14: - return atomic_op_test<DecAtomicTest, T, ExecSpace>(old_val, update); + return +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + atomic_op_test<DeprecatedDecrementAtomicTest, T, ExecSpace>(old_val, + update) && +#endif + + atomic_op_test<DecAtomicTest, T, ExecSpace>(old_val, update); case 15: - return atomic_op_test<LoadStoreAtomicTest, T, ExecSpace>(old_val, update); + return +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + atomic_op_test<DeprecatedAssignAtomicTest, T, ExecSpace>(old_val, + update) && +#endif + + atomic_op_test<LoadStoreAtomicTest, T, ExecSpace>(old_val, update); } return true; @@ -440,10 +582,20 @@ bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in, case 2: return atomic_op_test<MaxAtomicTest, T, ExecSpace>(old_val, update); case 3: return atomic_op_test<MinAtomicTest, T, ExecSpace>(old_val, update); case 4: return atomic_op_test<MulAtomicTest, T, ExecSpace>(old_val, 
update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // NVHPC may use different internal precisions for the device and host + // atomic operations. Therefore, relative errors are used to compare the + // host results and device results. + case 5: + return update != 0 ? atomic_op_test_rel<DivAtomicTest, T, ExecSpace>( + old_val, update) + : true; +#else case 5: return update != 0 ? atomic_op_test<DivAtomicTest, T, ExecSpace>(old_val, update) : true; +#endif case 6: return atomic_op_test<LoadStoreAtomicTest, T, ExecSpace>(old_val, update); } diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp index 5708fd2ebf732a4129683489714499f189c0bb6e..f8bec6705499dd16514e1d7e8738ba7516154c23 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp @@ -22,7 +22,7 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_complexdouble) { #if defined(KOKKOS_ENABLE_SYCL) && \ !defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) GTEST_SKIP() << "skipping since device_global variables are not available"; #endif const int start = -5; diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp index 97bfeea6ad94c18a278bacae5a7a995d917c1798..5e4881c406695d2d643b137cb1e6b051c0b2f3f5 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp @@ -20,6 +20,10 @@ using namespace TestAtomicOperations; namespace Test { TEST(TEST_CATEGORY, atomic_operations_complexfloat) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if 
defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp index 30f7e5e3bdedcb98b7975a13e7826b4e056eb7a7..12f0c9fa4f3374a6a48ffc74ab44eafd84554acf 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_double) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp index 73ea439808926100f8ab39490a986323fdf31524..7ccda57cb343801dca103cec1485313791532813 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_float) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp index 5aeaecd7af4ce8ad63986dabd46d9604a4419523..59a10eec552bb89b2738c413fbbb81548c2cb597 100644 --- 
a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_int) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp index b181171dd580ea2165cd9d62c020a7627bd3c44c..5d94ec0eb39a70aa65e6d6b3a29c87ec366b7aa8 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_long) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = -5; const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp index aa21722f474e724867c864013d93f162ec1ad185..589b6a543e1bbfd584f6c5a0944f62c442e16388 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_longlong) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = -5; 
const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_shared.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_shared.hpp index 90b3e23c3c24148d579b9a15d6f9c7877ac7c011..8f817101cda9e2df97859b6cd8c5c839935d3e42 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_shared.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_shared.hpp @@ -40,6 +40,10 @@ struct TestSharedAtomicsFunctor { }; TEST(TEST_CATEGORY, atomic_shared) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif TEST_EXECSPACE exec; Kokkos::View<int, typename TEST_EXECSPACE::memory_space> view("ref_value"); auto team_size = diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp index 96acb94bb16649b1084f28d63fcf42c42b47a35d..1d2c1a8900e8718426ea7a0708710c1d843b4f71 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_unsigned) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = 0; const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp index 3482f6fe1ed4f25172e865f2fe2d4fe41f3e8f74..3e2231b68f98a7368cf2ecb705ccb2213e00fc7e 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp +++ 
b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_unsignedlong) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = 0; const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp index cf41dedccb7b0b34c87946ae11a7a1dff5b4fa3d..b18640104564b76fa6076e979190bc4e68f8c9c2 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlonglongint.hpp @@ -18,6 +18,10 @@ namespace Test { TEST(TEST_CATEGORY, atomic_operations_unsignedlonglong) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif const int start = 0; const int end = 11; for (int i = start; i < end; ++i) { diff --git a/packages/kokkos/core/unit_test/TestAtomicViews.hpp b/packages/kokkos/core/unit_test/TestAtomicViews.hpp index 7f50b6dd717c58065eba606e2fb78fe270435e0b..55ff62822bcc4457520b833e0c157bc457735c4f 100644 --- a/packages/kokkos/core/unit_test/TestAtomicViews.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicViews.hpp @@ -16,7 +16,7 @@ #include <Kokkos_Core.hpp> -namespace TestAtomicViews { +namespace { //------------------------------------------------- //-----------atomic view api tests----------------- @@ -124,10 +124,10 @@ class TestAtomicViewAPI { using dView0 = Kokkos::View<T, device>; using dView1 = Kokkos::View<T*, device>; - using dView2 = Kokkos::View<T * [N1], device>; - using dView3 = 
Kokkos::View<T * [N1][N2], device>; - using dView4 = Kokkos::View<T * [N1][N2][N3], device>; - using const_dView4 = Kokkos::View<const T * [N1][N2][N3], device>; + using dView2 = Kokkos::View<T* [N1], device>; + using dView3 = Kokkos::View<T* [N1][N2], device>; + using dView4 = Kokkos::View<T* [N1][N2][N3], device>; + using const_dView4 = Kokkos::View<const T* [N1][N2][N3], device>; using dView4_unmanaged = Kokkos::View<T****, device, Kokkos::MemoryUnmanaged>; using host = typename dView0::host_mirror_space; @@ -135,12 +135,12 @@ class TestAtomicViewAPI { using aView1 = Kokkos::View<T*, device, Kokkos::MemoryTraits<Kokkos::Atomic> >; using aView2 = - Kokkos::View<T * [N1], device, Kokkos::MemoryTraits<Kokkos::Atomic> >; + Kokkos::View<T* [N1], device, Kokkos::MemoryTraits<Kokkos::Atomic> >; using aView3 = - Kokkos::View<T * [N1][N2], device, Kokkos::MemoryTraits<Kokkos::Atomic> >; - using aView4 = Kokkos::View<T * [N1][N2][N3], device, + Kokkos::View<T* [N1][N2], device, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using aView4 = Kokkos::View<T* [N1][N2][N3], device, Kokkos::MemoryTraits<Kokkos::Atomic> >; - using const_aView4 = Kokkos::View<const T * [N1][N2][N3], device, + using const_aView4 = Kokkos::View<const T* [N1][N2][N3], device, Kokkos::MemoryTraits<Kokkos::Atomic> >; using aView4_unmanaged = @@ -415,21 +415,13 @@ T PlusEqualAtomicViewCheck(const int64_t input_length) { } template <class T, class DeviceType> -bool PlusEqualAtomicViewTest(int64_t input_length) { +void PlusEqualAtomicViewTest(int64_t input_length) { T res = PlusEqualAtomicView<T, DeviceType>(input_length); T resSerial = PlusEqualAtomicViewCheck<T>(input_length); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = PlusEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "PlusEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + 
<< ">(length=" << input_length << ")"; } //--------------------------------------------------- @@ -512,21 +504,13 @@ T MinusEqualAtomicViewCheck(const int64_t input_length) { } template <class T, class DeviceType> -bool MinusEqualAtomicViewTest(int64_t input_length) { +void MinusEqualAtomicViewTest(int64_t input_length) { T res = MinusEqualAtomicView<T, DeviceType>(input_length); T resSerial = MinusEqualAtomicViewCheck<T>(input_length); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = MinusEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "MinusEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ")"; } //--------------------------------------------------- @@ -601,22 +585,14 @@ T TimesEqualAtomicViewCheck(const int64_t input_length, } template <class T, class DeviceType> -bool TimesEqualAtomicViewTest(const int64_t input_length) { +void TimesEqualAtomicViewTest(const int64_t input_length) { const int64_t remainder = 23; T res = TimesEqualAtomicView<T, DeviceType>(input_length, remainder); T resSerial = TimesEqualAtomicViewCheck<T>(input_length, remainder); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = TimesEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "TimesEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ")"; } //--------------------------------------------------- @@ -690,23 +666,15 @@ T DivEqualAtomicViewCheck(const int64_t input_length, const int64_t remainder) { } template <class T, class DeviceType> -bool DivEqualAtomicViewTest(const int64_t input_length) { +void DivEqualAtomicViewTest(const int64_t input_length) { const int64_t 
remainder = 23; T res = DivEqualAtomicView<T, DeviceType>(input_length, remainder); T resSerial = DivEqualAtomicViewCheck<T>(input_length, remainder); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = DivEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "DivEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ",remainder=" << remainder << ")"; } //--------------------------------------------------- @@ -780,8 +748,8 @@ T ModEqualAtomicViewCheck(const int64_t input_length, const int64_t remainder) { } template <class T, class DeviceType> -bool ModEqualAtomicViewTest(const int64_t input_length) { - static_assert(std::is_integral<T>::value, +void ModEqualAtomicViewTest(const int64_t input_length) { + static_assert(std::is_integral_v<T>, "ModEqualAtomicView Error: Type must be integral type for this " "unit test"); @@ -790,17 +758,9 @@ bool ModEqualAtomicViewTest(const int64_t input_length) { T res = ModEqualAtomicView<T, DeviceType>(input_length, remainder); T resSerial = ModEqualAtomicViewCheck<T>(input_length, remainder); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = ModEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "ModEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ",remainder=" << remainder << ")"; } //--------------------------------------------------- @@ -908,8 +868,8 @@ T RSEqualAtomicViewCheck(const int64_t input_length, const int64_t value, } template <class T, class DeviceType> -bool RSEqualAtomicViewTest(const int64_t input_length) { - static_assert(std::is_integral<T>::value, +void RSEqualAtomicViewTest(const int64_t input_length) { 
+ static_assert(std::is_integral_v<T>, "RSEqualAtomicViewTest: Must be integral type for test"); const int64_t remainder = 61042; // prime - 1 @@ -917,17 +877,10 @@ bool RSEqualAtomicViewTest(const int64_t input_length) { T res = RSEqualAtomicView<T, DeviceType>(input_length, value, remainder); T resSerial = RSEqualAtomicViewCheck<T>(input_length, value, remainder); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = RSEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "RSEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ",value=" << value + << ",remainder=" << remainder << ")"; } //--------------------------------------------------- @@ -1035,8 +988,8 @@ T LSEqualAtomicViewCheck(const int64_t input_length, const int64_t value, } template <class T, class DeviceType> -bool LSEqualAtomicViewTest(const int64_t input_length) { - static_assert(std::is_integral<T>::value, +void LSEqualAtomicViewTest(const int64_t input_length) { + static_assert(std::is_integral_v<T>, "LSEqualAtomicViewTest: Must be integral type for test"); const int64_t remainder = 61042; // prime - 1 @@ -1044,17 +997,10 @@ bool LSEqualAtomicViewTest(const int64_t input_length) { T res = LSEqualAtomicView<T, DeviceType>(input_length, value, remainder); T resSerial = LSEqualAtomicViewCheck<T>(input_length, value, remainder); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = RSEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "LSEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ",value=" << value + << ",remainder=" << remainder << ")"; } //--------------------------------------------------- 
@@ -1119,35 +1065,23 @@ T AndEqualAtomicViewCheck(const int64_t input_length) { const int64_t N = input_length; T result[2] = {1}; for (int64_t i = 0; i < N; ++i) { - if (N % 2 == 0) { - result[0] &= (T)i; - } else { - result[1] &= (T)i; - } + int64_t idx = N % 2; + result[idx] &= (T)i; } - return (result[0]); } template <class T, class DeviceType> -bool AndEqualAtomicViewTest(int64_t input_length) { - static_assert(std::is_integral<T>::value, +void AndEqualAtomicViewTest(int64_t input_length) { + static_assert(std::is_integral_v<T>, "AndEqualAtomicViewTest: Must be integral type for test"); T res = AndEqualAtomicView<T, DeviceType>(input_length); T resSerial = AndEqualAtomicViewCheck<T>(input_length); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = AndEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "AndEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ")"; } //--------------------------------------------------- @@ -1222,24 +1156,16 @@ T OrEqualAtomicViewCheck(const int64_t input_length) { } template <class T, class DeviceType> -bool OrEqualAtomicViewTest(int64_t input_length) { - static_assert(std::is_integral<T>::value, +void OrEqualAtomicViewTest(int64_t input_length) { + static_assert(std::is_integral_v<T>, "OrEqualAtomicViewTest: Must be integral type for test"); T res = OrEqualAtomicView<T, DeviceType>(input_length); T resSerial = OrEqualAtomicViewCheck<T>(input_length); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = OrEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "OrEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ")"; } 
//--------------------------------------------------- @@ -1314,119 +1240,42 @@ T XOrEqualAtomicViewCheck(const int64_t input_length) { } template <class T, class DeviceType> -bool XOrEqualAtomicViewTest(int64_t input_length) { - static_assert(std::is_integral<T>::value, +void XOrEqualAtomicViewTest(int64_t input_length) { + static_assert(std::is_integral_v<T>, "XOrEqualAtomicViewTest: Must be integral type for test"); T res = XOrEqualAtomicView<T, DeviceType>(input_length); T resSerial = XOrEqualAtomicViewCheck<T>(input_length); - bool passed = true; - - if (resSerial != res) { - passed = false; - - std::cout << "Loop<" << typeid(T).name() - << ">( test = XOrEqualAtomicViewTest" - << " FAILED : " << resSerial << " != " << res << std::endl; - } - - return passed; + ASSERT_EQ(res, resSerial) + << "XOrEqualAtomicViewTest<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(length=" << input_length << ")"; } // inc/dec? -//--------------------------------------------------- -//--------------atomic_test_control------------------ -//--------------------------------------------------- - -template <class T, class DeviceType> -bool AtomicViewsTestIntegralType(const int length, int test) { - static_assert(std::is_integral<T>::value, - "TestAtomicViews Error: Non-integral type passed into " - "IntegralType tests"); - - switch (test) { - case 1: return PlusEqualAtomicViewTest<T, DeviceType>(length); - case 2: return MinusEqualAtomicViewTest<T, DeviceType>(length); - case 3: return RSEqualAtomicViewTest<T, DeviceType>(length); - case 4: return LSEqualAtomicViewTest<T, DeviceType>(length); - case 5: return ModEqualAtomicViewTest<T, DeviceType>(length); - case 6: return AndEqualAtomicViewTest<T, DeviceType>(length); - case 7: return OrEqualAtomicViewTest<T, DeviceType>(length); - case 8: return XOrEqualAtomicViewTest<T, DeviceType>(length); - } - - return 0; -} - -template <class T, class DeviceType> -bool AtomicViewsTestNonIntegralType(const int length, int test) { - switch (test) { - 
case 1: return PlusEqualAtomicViewTest<T, DeviceType>(length); - case 2: return MinusEqualAtomicViewTest<T, DeviceType>(length); - case 3: return TimesEqualAtomicViewTest<T, DeviceType>(length); - case 4: return DivEqualAtomicViewTest<T, DeviceType>(length); - } - - return 0; -} - -} // namespace TestAtomicViews - -namespace Test { - TEST(TEST_CATEGORY, atomic_views_integral) { const int64_t length = 1000000; - { - // Integral Types. - ASSERT_TRUE( - (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( - length, 1))); - ASSERT_TRUE( - (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( - length, 2))); - ASSERT_TRUE( - (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( - length, 3))); - ASSERT_TRUE( - (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( - length, 4))); - ASSERT_TRUE( - (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( - length, 5))); - ASSERT_TRUE( - (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( - length, 6))); - ASSERT_TRUE( - (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( - length, 7))); - ASSERT_TRUE( - (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( - length, 8))); - } + PlusEqualAtomicViewTest<int64_t, TEST_EXECSPACE>(length); + MinusEqualAtomicViewTest<int64_t, TEST_EXECSPACE>(length); + RSEqualAtomicViewTest<int64_t, TEST_EXECSPACE>(length); + LSEqualAtomicViewTest<int64_t, TEST_EXECSPACE>(length); + ModEqualAtomicViewTest<int64_t, TEST_EXECSPACE>(length); + AndEqualAtomicViewTest<int64_t, TEST_EXECSPACE>(length); + OrEqualAtomicViewTest<int64_t, TEST_EXECSPACE>(length); + XOrEqualAtomicViewTest<int64_t, TEST_EXECSPACE>(length); } TEST(TEST_CATEGORY, atomic_views_nonintegral) { const int64_t length = 1000000; - { - // Non-Integral Types. 
- ASSERT_TRUE(( - TestAtomicViews::AtomicViewsTestNonIntegralType<double, TEST_EXECSPACE>( - length, 1))); - ASSERT_TRUE(( - TestAtomicViews::AtomicViewsTestNonIntegralType<double, TEST_EXECSPACE>( - length, 2))); - ASSERT_TRUE(( - TestAtomicViews::AtomicViewsTestNonIntegralType<double, TEST_EXECSPACE>( - length, 3))); - ASSERT_TRUE(( - TestAtomicViews::AtomicViewsTestNonIntegralType<double, TEST_EXECSPACE>( - length, 4))); - } + PlusEqualAtomicViewTest<double, TEST_EXECSPACE>(length); + MinusEqualAtomicViewTest<double, TEST_EXECSPACE>(length); + TimesEqualAtomicViewTest<double, TEST_EXECSPACE>(length); + DivEqualAtomicViewTest<double, TEST_EXECSPACE>(length); } TEST(TEST_CATEGORY, atomic_view_api) { - TestAtomicViews::TestAtomicViewAPI<int, TEST_EXECSPACE>(); + TestAtomicViewAPI<int, TEST_EXECSPACE>(); } -} // namespace Test + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestAtomics.hpp b/packages/kokkos/core/unit_test/TestAtomics.hpp index 2b40f12d0a4deaf3f970697c6e688208b20f9570..1a94ce4a4d7c88f634099fa91dddf11801352541 100644 --- a/packages/kokkos/core/unit_test/TestAtomics.hpp +++ b/packages/kokkos/core/unit_test/TestAtomics.hpp @@ -16,7 +16,7 @@ #include <Kokkos_Core.hpp> -namespace TestAtomic { +namespace { // Struct for testing arbitrary size atomics. 
@@ -157,17 +157,6 @@ struct AddFunctor { void operator()(int) const { Kokkos::atomic_fetch_add(&data(), (T)1); } }; -template <class T, class DEVICE_TYPE> -struct AddFunctorReduce { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View<T, execution_space>; - - type data; - - KOKKOS_INLINE_FUNCTION - void operator()(int, int&) const { Kokkos::atomic_fetch_add(&data(), (T)1); } -}; - template <class T, class execution_space> T AddLoop(int loop) { struct ZeroFunctor<T, execution_space> f_zero; @@ -188,12 +177,6 @@ T AddLoop(int loop) { Kokkos::deep_copy(h_data, data); T val = h_data(); - struct AddFunctorReduce<T, execution_space> f_add_red; - f_add_red.data = data; - int dummy_result; - Kokkos::parallel_reduce(loop, f_add_red, dummy_result); - execution_space().fence(); - return val; } @@ -236,26 +219,6 @@ struct CASFunctor { } }; -template <class T, class DEVICE_TYPE> -struct CASFunctorReduce { - using execution_space = DEVICE_TYPE; - using type = Kokkos::View<T, execution_space>; - - type data; - - KOKKOS_INLINE_FUNCTION - void operator()(int, int&) const { - T old = data(); - T newval, assumed; - - do { - assumed = old; - newval = assumed + (T)1; - old = Kokkos::atomic_compare_exchange(&data(), assumed, newval); - } while (old != assumed); - } -}; - template <class T, class execution_space> T CASLoop(int loop) { struct ZeroFunctor<T, execution_space> f_zero; @@ -274,12 +237,6 @@ T CASLoop(int loop) { Kokkos::deep_copy(h_data, data); T val = h_data(); - struct CASFunctorReduce<T, execution_space> f_cas_red; - f_cas_red.data = data; - int dummy_result; - Kokkos::parallel_reduce(loop, f_cas_red, dummy_result); - execution_space().fence(); - return val; } @@ -308,32 +265,70 @@ T CASLoopSerial(int loop) { } //---------------------------------------------- -//--------------atomic_exchange----------------- +//--------------atomic_compare_exchange_strong-- //---------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef 
KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif template <class T, class DEVICE_TYPE> -struct ExchFunctor { +struct DeprecatedCASFunctor { using execution_space = DEVICE_TYPE; using type = Kokkos::View<T, execution_space>; - type data, data2; + type data; KOKKOS_INLINE_FUNCTION - void operator()(int i) const { - T old = Kokkos::atomic_exchange(&data(), (T)i); - Kokkos::atomic_fetch_add(&data2(), old); + void operator()(int) const { + T newval, assumed; + + do { + assumed = Kokkos::volatile_load(&data()); + newval = assumed + (T)1; + } while (!Kokkos::atomic_compare_exchange_strong(&data(), assumed, newval)); } }; +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + +template <class T, class execution_space> +T DeprecatedCASLoop(int loop) { + struct ZeroFunctor<T, execution_space> f_zero; + typename ZeroFunctor<T, execution_space>::type data("Data"); + typename ZeroFunctor<T, execution_space>::h_type h_data("HData"); + + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + execution_space().fence(); + + struct DeprecatedCASFunctor<T, execution_space> f_cas; + f_cas.data = data; + Kokkos::parallel_for(loop, f_cas); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +#endif + +//---------------------------------------------- +//--------------atomic_exchange----------------- +//---------------------------------------------- template <class T, class DEVICE_TYPE> -struct ExchFunctorReduce { +struct ExchFunctor { using execution_space = DEVICE_TYPE; using type = Kokkos::View<T, execution_space>; type data, data2; KOKKOS_INLINE_FUNCTION - void operator()(int i, int&) const { + void operator()(int i) const { T old = Kokkos::atomic_exchange(&data(), (T)i); Kokkos::atomic_fetch_add(&data2(), old); } @@ -366,20 +361,13 @@ T ExchLoop(int loop) { Kokkos::deep_copy(h_data2, data2); T val = h_data() + h_data2(); - struct 
ExchFunctorReduce<T, execution_space> f_exch_red; - f_exch_red.data = data; - f_exch_red.data2 = data2; - int dummy_result; - Kokkos::parallel_reduce(loop, f_exch_red, dummy_result); - execution_space().fence(); - return val; } template <class T> -T ExchLoopSerial(std::conditional_t< - !std::is_same<T, Kokkos::complex<double> >::value, int, void> - loop) { +T ExchLoopSerial( + std::conditional_t<!std::is_same_v<T, Kokkos::complex<double> >, int, void> + loop) { T* data = new T[1]; T* data2 = new T[1]; data[0] = 0; @@ -399,9 +387,9 @@ T ExchLoopSerial(std::conditional_t< } template <class T> -T ExchLoopSerial(std::conditional_t< - std::is_same<T, Kokkos::complex<double> >::value, int, void> - loop) { +T ExchLoopSerial( + std::conditional_t<std::is_same_v<T, Kokkos::complex<double> >, int, void> + loop) { T* data = new T[1]; T* data2 = new T[1]; data[0] = 0; @@ -427,8 +415,12 @@ T LoopVariant(int loop, int test) { case 1: return AddLoop<T, DeviceType>(loop); case 2: return CASLoop<T, DeviceType>(loop); case 3: return ExchLoop<T, DeviceType>(loop); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + case 4: return DeprecatedCASLoop<T, DeviceType>(loop); +#endif } + Kokkos::abort("unreachable"); return 0; } @@ -438,106 +430,121 @@ T LoopVariantSerial(int loop, int test) { case 1: return AddLoopSerial<T>(loop); case 2: return CASLoopSerial<T>(loop); case 3: return ExchLoopSerial<T>(loop); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + case 4: return CASLoopSerial<T>(loop); +#endif } + Kokkos::abort("unreachable"); return 0; } template <class T, class DeviceType> -bool Loop(int loop, int test) { +void Loop(int loop, int test) { T res = LoopVariant<T, DeviceType>(loop, test); T resSerial = LoopVariantSerial<T>(loop, test); - bool passed = true; + ASSERT_EQ(res, resSerial) << "Loop<" << Kokkos::Impl::TypeInfo<T>::name() + << ">(loop=" << loop << ",test=" << test << ")"; +} - if (resSerial != res) { - passed = false; +TEST(TEST_CATEGORY, atomics) { + const int loop_count = 1e4; - 
std::cout << "Loop<" << typeid(T).name() << ">( test = " << test - << " FAILED : " << resSerial << " != " << res << std::endl; - } + Loop<int, TEST_EXECSPACE>(loop_count, 1); + Loop<int, TEST_EXECSPACE>(loop_count, 2); + Loop<int, TEST_EXECSPACE>(loop_count, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<int, TEST_EXECSPACE>(loop_count, 4); +#endif - return passed; -} + Loop<unsigned int, TEST_EXECSPACE>(loop_count, 1); + Loop<unsigned int, TEST_EXECSPACE>(loop_count, 2); + Loop<unsigned int, TEST_EXECSPACE>(loop_count, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<unsigned int, TEST_EXECSPACE>(loop_count, 4); +#endif -} // namespace TestAtomic + Loop<long int, TEST_EXECSPACE>(loop_count, 1); + Loop<long int, TEST_EXECSPACE>(loop_count, 2); + Loop<long int, TEST_EXECSPACE>(loop_count, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<long int, TEST_EXECSPACE>(loop_count, 4); +#endif -namespace Test { + Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 1); + Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 2); + Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 4); +#endif -TEST(TEST_CATEGORY, atomics) { - const int loop_count = 1e4; + Loop<long long int, TEST_EXECSPACE>(loop_count, 1); + Loop<long long int, TEST_EXECSPACE>(loop_count, 2); + Loop<long long int, TEST_EXECSPACE>(loop_count, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<long long int, TEST_EXECSPACE>(loop_count, 4); +#endif + + Loop<double, TEST_EXECSPACE>(loop_count, 1); + Loop<double, TEST_EXECSPACE>(loop_count, 2); + Loop<double, TEST_EXECSPACE>(loop_count, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<double, TEST_EXECSPACE>(loop_count, 4); +#endif + + Loop<float, TEST_EXECSPACE>(100, 1); + Loop<float, TEST_EXECSPACE>(100, 2); + Loop<float, TEST_EXECSPACE>(100, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<float, TEST_EXECSPACE>(100, 4); +#endif + + // FIXME_OPENMPTARGET 
+ // FIXME_OPENACC: atomic operations on composite types are not supported. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) + Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 1); + Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 2); + Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 4); +#endif - ASSERT_TRUE((TestAtomic::Loop<int, TEST_EXECSPACE>(loop_count, 1))); - ASSERT_TRUE((TestAtomic::Loop<int, TEST_EXECSPACE>(loop_count, 2))); - ASSERT_TRUE((TestAtomic::Loop<int, TEST_EXECSPACE>(loop_count, 3))); - - ASSERT_TRUE((TestAtomic::Loop<unsigned int, TEST_EXECSPACE>(loop_count, 1))); - ASSERT_TRUE((TestAtomic::Loop<unsigned int, TEST_EXECSPACE>(loop_count, 2))); - ASSERT_TRUE((TestAtomic::Loop<unsigned int, TEST_EXECSPACE>(loop_count, 3))); - - ASSERT_TRUE((TestAtomic::Loop<long int, TEST_EXECSPACE>(loop_count, 1))); - ASSERT_TRUE((TestAtomic::Loop<long int, TEST_EXECSPACE>(loop_count, 2))); - ASSERT_TRUE((TestAtomic::Loop<long int, TEST_EXECSPACE>(loop_count, 3))); - - ASSERT_TRUE( - (TestAtomic::Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 1))); - ASSERT_TRUE( - (TestAtomic::Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 2))); - ASSERT_TRUE( - (TestAtomic::Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 3))); - - ASSERT_TRUE((TestAtomic::Loop<long long int, TEST_EXECSPACE>(loop_count, 1))); - ASSERT_TRUE((TestAtomic::Loop<long long int, TEST_EXECSPACE>(loop_count, 2))); - ASSERT_TRUE((TestAtomic::Loop<long long int, TEST_EXECSPACE>(loop_count, 3))); - - ASSERT_TRUE((TestAtomic::Loop<double, TEST_EXECSPACE>(loop_count, 1))); - ASSERT_TRUE((TestAtomic::Loop<double, TEST_EXECSPACE>(loop_count, 2))); - ASSERT_TRUE((TestAtomic::Loop<double, TEST_EXECSPACE>(loop_count, 3))); - - ASSERT_TRUE((TestAtomic::Loop<float, TEST_EXECSPACE>(100, 1))); - ASSERT_TRUE((TestAtomic::Loop<float, TEST_EXECSPACE>(100, 2))); - 
ASSERT_TRUE((TestAtomic::Loop<float, TEST_EXECSPACE>(100, 3))); - -#ifndef KOKKOS_ENABLE_OPENMPTARGET - ASSERT_TRUE((TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 1))); - ASSERT_TRUE((TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 2))); - ASSERT_TRUE((TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 3))); - - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 1))); - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 2))); - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 3))); + Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 1); + Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 2); + Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 4); +#endif // FIXME_SYCL Replace macro by SYCL_EXT_ONEAPI_DEVICE_GLOBAL or remove // condition alltogether when possible. #if defined(KOKKOS_ENABLE_SYCL) && \ !defined(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) return; + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) return; +#endif + Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 1); + Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 2); + Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 4); +#endif + + Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 1); + Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 2); + Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 4); #endif - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 1))); - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 2))); - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<double>, 
TEST_EXECSPACE>(1, 3))); - - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 1))); - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 2))); - ASSERT_TRUE( - (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 3))); // WORKAROUND MSVC #ifndef _WIN32 - ASSERT_TRUE( - (TestAtomic::Loop<TestAtomic::SuperScalar<4>, TEST_EXECSPACE>(100, 1))); - ASSERT_TRUE( - (TestAtomic::Loop<TestAtomic::SuperScalar<4>, TEST_EXECSPACE>(100, 2))); - ASSERT_TRUE( - (TestAtomic::Loop<TestAtomic::SuperScalar<4>, TEST_EXECSPACE>(100, 3))); + Loop<SuperScalar<4>, TEST_EXECSPACE>(100, 1); + Loop<SuperScalar<4>, TEST_EXECSPACE>(100, 2); + Loop<SuperScalar<4>, TEST_EXECSPACE>(100, 3); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Loop<SuperScalar<4>, TEST_EXECSPACE>(100, 4); +#endif #endif #endif } @@ -585,4 +592,4 @@ struct TpetraUseCase { TEST(TEST_CATEGORY, atomics_tpetra_max_abs) { TpetraUseCase().check(); } -} // namespace Test +} // namespace diff --git a/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp b/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp index 092e7cff6180814f73279e91830e33fa619d4134..dbe1d8d1f5fae5f86e14c74e13a2efa80f3e3e99 100644 --- a/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp @@ -228,7 +228,7 @@ TEST(TEST_CATEGORY, bit_manip_countr_zero) { #endif #if defined(KOKKOS_ENABLE_SYCL) && \ !defined(KOKKOS_ARCH_INTEL_GPU) // FIXME_SYCL returns wrong result - if (!std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) + if (!std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) #endif test_bit_manip_countr_zero<unsigned char>(); test_bit_manip_countr_zero<unsigned short>(); @@ -274,7 +274,7 @@ TEST(TEST_CATEGORY, bit_manip_countr_one) { #endif #if defined(KOKKOS_ENABLE_SYCL) && \ !defined(KOKKOS_ARCH_INTEL_GPU) // FIXME_SYCL returns wrong result - if (!std::is_same_v<TEST_EXECSPACE, 
Kokkos::Experimental::SYCL>) + if (!std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) #endif test_bit_manip_countr_one<unsigned char>(); test_bit_manip_countr_one<unsigned short>(); @@ -804,35 +804,29 @@ struct TestBitCastFunction { using Kokkos::bit_cast; if (bit_cast<int>(123) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n"); + Kokkos::printf("failed check #1\n"); } if (bit_cast<int>(123u) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n"); + Kokkos::printf("failed check #2\n"); } if (bit_cast<int>(~0u) != ~0) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n"); + Kokkos::printf("failed check #3\n"); } if constexpr (sizeof(int) == sizeof(float)) { if (!check<int>(12.34f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n"); + Kokkos::printf("failed check #4\n"); } } if constexpr (sizeof(unsigned long long) == sizeof(double)) { if (!check<unsigned long long>(123.456)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n"); + Kokkos::printf("failed check #5\n"); } } -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if constexpr (std::is_same_v<Space, Kokkos::Cuda>) { - return; - } -#endif struct S { int i; @@ -848,11 +842,11 @@ struct TestBitCastFunction { } if (!(bit_cast<S>(arr) == arr)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n"); + Kokkos::printf("failed check #6\n"); } if (!(bit_cast<S>(arr2) == arr2)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n"); + Kokkos::printf("failed check #7\n"); } } }; diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp b/packages/kokkos/core/unit_test/TestCStyleMemoryManagement.cpp similarity index 73% rename from packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp rename to packages/kokkos/core/unit_test/TestCStyleMemoryManagement.cpp index 2bb378d294c4e3285692e1211a8d1328a438a93e..7cd16d697f0fbd39215f41175369dc589ddda7e7 100644 --- 
a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp +++ b/packages/kokkos/core/unit_test/TestCStyleMemoryManagement.cpp @@ -14,28 +14,29 @@ // //@HEADER -#include <gtest/gtest.h> - #include <Kokkos_Core.hpp> -#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) - #include <TestDefaultDeviceType_Category.hpp> -#include <TestUtilities.hpp> -namespace Test { +#include <gtest/gtest.h> + +namespace { -TEST(defaultdevicetype, malloc) { +TEST(defaultdevicetype, c_style_memory_management_malloc_realloc_and_free) { int* data = static_cast<int*>(Kokkos::kokkos_malloc(100 * sizeof(int))); - ASSERT_NO_THROW(data = static_cast<int*>( - Kokkos::kokkos_realloc(data, 120 * sizeof(int)))); + ASSERT_NE(data, nullptr); + + data = static_cast<int*>(Kokkos::kokkos_realloc(data, 120 * sizeof(int))); + ASSERT_NE(data, nullptr); + Kokkos::kokkos_free(data); +} +TEST(defaultdevicetype, c_style_memory_management_malloc_zero_byte_and_free) { int* data2 = static_cast<int*>(Kokkos::kokkos_malloc(0)); ASSERT_EQ(data2, nullptr); + Kokkos::kokkos_free(data2); } -} // namespace Test - -#endif +} // namespace diff --git a/packages/kokkos/core/unit_test/TestCTestDevice.cpp b/packages/kokkos/core/unit_test/TestCTestDevice.cpp index 5be4714f9eaf9e60a2d3d491954589097f17b9d8..134dac4e63aa3454e3fd8d8745573d57575e10fa 100644 --- a/packages/kokkos/core/unit_test/TestCTestDevice.cpp +++ b/packages/kokkos/core/unit_test/TestCTestDevice.cpp @@ -32,19 +32,9 @@ int setenv(const char *name, const char *value, int overwrite) { int unsetenv(const char *name) { return _putenv_s(name, ""); } #endif -// Needed because https://github.com/google/googletest/issues/952 has not been -// resolved -#define EXPECT_THROW_WITH_MESSAGE(stmt, etype, whatstring) \ - EXPECT_THROW( \ - try { stmt; } catch (const etype &ex) { \ - EXPECT_EQ(std::string(ex.what()).find(whatstring), 0u); \ - throw; \ - }, \ - etype) - class ctest_environment : public ::testing::Test { protected: - void SetUp(); + void SetUp() 
override; }; void ctest_environment::SetUp() { @@ -76,6 +66,8 @@ void ctest_environment::SetUp() { setenv("CTEST_RESOURCE_GROUP_9_GPUS", "id:4,slots:1", 1); } +struct ctest_environment_DeathTest : public ctest_environment {}; + TEST_F(ctest_environment, no_device_type) { unsetenv("CTEST_KOKKOS_DEVICE_TYPE"); EXPECT_EQ(Kokkos::Impl::get_ctest_gpu(0), 0); @@ -86,47 +78,47 @@ TEST_F(ctest_environment, no_process_count) { EXPECT_EQ(Kokkos::Impl::get_ctest_gpu(0), 0); } -TEST_F(ctest_environment, invalid_rank) { - EXPECT_THROW_WITH_MESSAGE( - Kokkos::Impl::get_ctest_gpu(10), std::runtime_error, - "Error: local rank 10 is outside the bounds of resource groups provided " - "by CTest."); +TEST_F(ctest_environment_DeathTest, invalid_rank) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH(Kokkos::Impl::get_ctest_gpu(10), + "Error: local rank 10 is outside the bounds of resource groups " + "provided by CTest."); } -TEST_F(ctest_environment, no_type_str) { - EXPECT_THROW_WITH_MESSAGE( - Kokkos::Impl::get_ctest_gpu(0), std::runtime_error, - "Error: CTEST_RESOURCE_GROUP_0 is not specified. Raised by " - "Kokkos::Impl::get_ctest_gpu()."); +TEST_F(ctest_environment_DeathTest, no_type_str) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH(Kokkos::Impl::get_ctest_gpu(0), + "Error: CTEST_RESOURCE_GROUP_0 is not specified. Raised by " + "Kokkos::Impl::get_ctest_gpu\\(\\)."); } -TEST_F(ctest_environment, missing_type) { - EXPECT_THROW_WITH_MESSAGE( - Kokkos::Impl::get_ctest_gpu(1), std::runtime_error, +TEST_F(ctest_environment_DeathTest, missing_type) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + Kokkos::Impl::get_ctest_gpu(1), "Error: device type 'gpus' not included in CTEST_RESOURCE_GROUP_1. 
" - "Raised by Kokkos::Impl::get_ctest_gpu()."); - EXPECT_THROW_WITH_MESSAGE( - Kokkos::Impl::get_ctest_gpu(2), std::runtime_error, + "Raised by Kokkos::Impl::get_ctest_gpu\\(\\)."); + EXPECT_DEATH( + Kokkos::Impl::get_ctest_gpu(2), "Error: device type 'gpus' not included in CTEST_RESOURCE_GROUP_2. " - "Raised by Kokkos::Impl::get_ctest_gpu()."); + "Raised by Kokkos::Impl::get_ctest_gpu\\(\\)."); } -TEST_F(ctest_environment, no_id_str) { - EXPECT_THROW_WITH_MESSAGE( - Kokkos::Impl::get_ctest_gpu(3), std::runtime_error, - "Error: CTEST_RESOURCE_GROUP_3_GPUS is not specified. Raised by " - "Kokkos::Impl::get_ctest_gpu()."); +TEST_F(ctest_environment_DeathTest, no_id_str) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH(Kokkos::Impl::get_ctest_gpu(3), + "Error: CTEST_RESOURCE_GROUP_3_GPUS is not specified. Raised by " + "Kokkos::Impl::get_ctest_gpu\\(\\)."); } -TEST_F(ctest_environment, invalid_id_str) { - EXPECT_THROW_WITH_MESSAGE( - Kokkos::Impl::get_ctest_gpu(4), std::runtime_error, - "Error: invalid value of CTEST_RESOURCE_GROUP_4_GPUS: 'id:2'. Raised by " - "Kokkos::Impl::get_ctest_gpu()."); - EXPECT_THROW_WITH_MESSAGE( - Kokkos::Impl::get_ctest_gpu(5), std::runtime_error, - "Error: invalid value of CTEST_RESOURCE_GROUP_5_GPUS: 'slots:1,id:2'. " - "Raised by Kokkos::Impl::get_ctest_gpu()."); +TEST_F(ctest_environment_DeathTest, invalid_id_str) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH(Kokkos::Impl::get_ctest_gpu(4), + "Error: invalid value of CTEST_RESOURCE_GROUP_4_GPUS: 'id:2'. " + "Raised by Kokkos::Impl::get_ctest_gpu\\(\\)."); + EXPECT_DEATH(Kokkos::Impl::get_ctest_gpu(5), + "Error: invalid value of CTEST_RESOURCE_GROUP_5_GPUS: " + "'slots:1,id:2'. 
Raised by Kokkos::Impl::get_ctest_gpu\\(\\)."); } TEST_F(ctest_environment, good) { diff --git a/packages/kokkos/core/unit_test/TestCXX11.hpp b/packages/kokkos/core/unit_test/TestCXX11.hpp index b35de9a3a04fde59a9591aee190fe6c0c960343b..5029b028f4b5e10cb54ab2d9fa4fdc45f75360fe 100644 --- a/packages/kokkos/core/unit_test/TestCXX11.hpp +++ b/packages/kokkos/core/unit_test/TestCXX11.hpp @@ -87,7 +87,6 @@ double AddTestFunctor() { return result; } -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) template <class DeviceType, bool PWRTest> double AddTestLambda() { Kokkos::View<double**, DeviceType> a("A", 100, 5); @@ -144,12 +143,6 @@ double AddTestLambda() { return result; } -#else -template <class DeviceType, bool PWRTest> -double AddTestLambda() { - return AddTestFunctor<DeviceType, PWRTest>(); -} -#endif template <class DeviceType> struct FunctorReduceTest { @@ -224,7 +217,6 @@ double ReduceTestFunctor() { return result; } -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) template <class DeviceType, bool PWRTest> double ReduceTestLambda() { using policy_type = Kokkos::TeamPolicy<DeviceType>; @@ -277,12 +269,6 @@ double ReduceTestLambda() { return result; } -#else -template <class DeviceType, bool PWRTest> -double ReduceTestLambda() { - return ReduceTestFunctor<DeviceType, PWRTest>(); -} -#endif template <class DeviceType> double TestVariantLambda(int test) { @@ -310,7 +296,6 @@ double TestVariantFunctor(int test) { template <class DeviceType> bool Test(int test) { -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA double res_functor = TestVariantFunctor<DeviceType>(test); double res_lambda = TestVariantLambda<DeviceType>(test); @@ -333,17 +318,13 @@ bool Test(int test) { } return passed; -#else - (void)test; - return true; -#endif } } // namespace TestCXX11 namespace Test { TEST(TEST_CATEGORY, cxx11) { - if (std::is_same<Kokkos::DefaultExecutionSpace, TEST_EXECSPACE>::value) { + if (std::is_same_v<Kokkos::DefaultExecutionSpace, TEST_EXECSPACE>) { 
ASSERT_TRUE((TestCXX11::Test<TEST_EXECSPACE>(1))); ASSERT_TRUE((TestCXX11::Test<TEST_EXECSPACE>(2))); ASSERT_TRUE((TestCXX11::Test<TEST_EXECSPACE>(3))); diff --git a/packages/kokkos/core/unit_test/TestCompilerMacros.cpp b/packages/kokkos/core/unit_test/TestCompilerMacros.cpp index 63b368b23ee16ee6ca3d34a475330f495e823ef4..5b464f043f1f781ac3450d23182fc9b9ad08e1e0 100644 --- a/packages/kokkos/core/unit_test/TestCompilerMacros.cpp +++ b/packages/kokkos/core/unit_test/TestCompilerMacros.cpp @@ -30,14 +30,12 @@ #endif #if defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) -#error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA shouldn't be defined" +#error "Macro bug: KOKKOS_ENABLE_CUDA_LAMBDA should be defined" #endif -#else + #if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA should be defined" #endif -#endif namespace TestCompilerMacros { diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp index bcae2e1d81600d929365d4fea34d220d29c893ea..185df52a50b9489b76c19af5f211d62a213e41be 100644 --- a/packages/kokkos/core/unit_test/TestComplex.hpp +++ b/packages/kokkos/core/unit_test/TestComplex.hpp @@ -14,10 +14,31 @@ // //@HEADER +#include <Kokkos_Macros.hpp> + +// Suppress "'long double' is treated as 'double' in device code" +// The suppression needs to happen before Kokkos_Complex.hpp is included to be +// effective +#ifdef KOKKOS_COMPILER_NVCC +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 20208 +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic push +#pragma diag_suppress 3245 +#endif +#endif +#endif + #include <Kokkos_Core.hpp> -#include <cstdio> #include <sstream> +namespace { +template <typename... Ts> +KOKKOS_FUNCTION constexpr void maybe_unused(Ts &&...) 
noexcept {} +} // namespace + namespace Test { // Test construction and assignment @@ -400,8 +421,8 @@ TEST(TEST_CATEGORY, complex_trivially_copyable) { using RealType = double; // clang claims compatibility with gcc 4.2.1 but all versions tested know // about std::is_trivially_copyable. - ASSERT_TRUE(std::is_trivially_copyable<Kokkos::complex<RealType>>::value || - !std::is_trivially_copyable<RealType>::value); + ASSERT_TRUE(std::is_trivially_copyable_v<Kokkos::complex<RealType>> || + !std::is_trivially_copyable_v<RealType>); } template <class ExecSpace> @@ -451,17 +472,15 @@ TEST(TEST_CATEGORY, complex_issue_3867) { ASSERT_FLOAT_EQ(x.real(), y.real()); ASSERT_FLOAT_EQ(x.imag(), y.imag()); -#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ - static_assert( \ - std::is_same<RETURNTYPE, \ - decltype(Kokkos::pow(std::declval<ARGTYPE1>(), \ - std::declval<ARGTYPE2>()))>::value, \ - ""); \ - static_assert( \ - std::is_same<RETURNTYPE, \ - decltype(Kokkos::pow(std::declval<ARGTYPE2>(), \ - std::declval<ARGTYPE1>()))>::value, \ - ""); +#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ + static_assert( \ + std::is_same<RETURNTYPE, \ + decltype(Kokkos::pow(std::declval<ARGTYPE1>(), \ + std::declval<ARGTYPE2>()))>::value); \ + static_assert( \ + std::is_same<RETURNTYPE, \ + decltype(Kokkos::pow(std::declval<ARGTYPE2>(), \ + std::declval<ARGTYPE1>()))>::value); CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, long double, Kokkos::complex<long double>); @@ -499,21 +518,19 @@ TEST(TEST_CATEGORY, complex_operations_arithmetic_types_overloads) { static_assert(Kokkos::real(2.f) == 2.f); static_assert(Kokkos::real(3.) 
== 3.); static_assert(Kokkos::real(4.l) == 4.l); - static_assert((std::is_same<decltype(Kokkos::real(1)), double>::value)); - static_assert((std::is_same<decltype(Kokkos::real(2.f)), float>::value)); - static_assert((std::is_same<decltype(Kokkos::real(3.)), double>::value)); - static_assert( - (std::is_same<decltype(Kokkos::real(4.l)), long double>::value)); + static_assert((std::is_same_v<decltype(Kokkos::real(1)), double>)); + static_assert((std::is_same_v<decltype(Kokkos::real(2.f)), float>)); + static_assert((std::is_same_v<decltype(Kokkos::real(3.)), double>)); + static_assert((std::is_same_v<decltype(Kokkos::real(4.l)), long double>)); static_assert(Kokkos::imag(1) == 0.); static_assert(Kokkos::imag(2.f) == 0.f); static_assert(Kokkos::imag(3.) == 0.); static_assert(Kokkos::imag(4.l) == 0.l); - static_assert((std::is_same<decltype(Kokkos::imag(1)), double>::value)); - static_assert((std::is_same<decltype(Kokkos::imag(2.f)), float>::value)); - static_assert((std::is_same<decltype(Kokkos::imag(3.)), double>::value)); - static_assert( - (std::is_same<decltype(Kokkos::real(4.l)), long double>::value)); + static_assert((std::is_same_v<decltype(Kokkos::imag(1)), double>)); + static_assert((std::is_same_v<decltype(Kokkos::imag(2.f)), float>)); + static_assert((std::is_same_v<decltype(Kokkos::imag(3.)), double>)); + static_assert((std::is_same_v<decltype(Kokkos::real(4.l)), long double>)); // FIXME in principle could be checked at compile time too ASSERT_EQ(Kokkos::conj(1), Kokkos::complex<double>(1)); @@ -523,15 +540,162 @@ TEST(TEST_CATEGORY, complex_operations_arithmetic_types_overloads) { // power of two. 
#ifndef KOKKOS_IMPL_32BIT ASSERT_EQ(Kokkos::conj(4.l), Kokkos::complex<long double>(4.l)); - static_assert(( - std::is_same<decltype(Kokkos::conj(1)), Kokkos::complex<double>>::value)); + static_assert( + (std::is_same_v<decltype(Kokkos::conj(1)), Kokkos::complex<double>>)); #endif - static_assert((std::is_same<decltype(Kokkos::conj(2.f)), - Kokkos::complex<float>>::value)); - static_assert((std::is_same<decltype(Kokkos::conj(3.)), - Kokkos::complex<double>>::value)); - static_assert((std::is_same<decltype(Kokkos::conj(4.l)), - Kokkos::complex<long double>>::value)); + static_assert( + (std::is_same_v<decltype(Kokkos::conj(2.f)), Kokkos::complex<float>>)); + static_assert( + (std::is_same_v<decltype(Kokkos::conj(3.)), Kokkos::complex<double>>)); + static_assert((std::is_same_v<decltype(Kokkos::conj(4.l)), + Kokkos::complex<long double>>)); +} + +template <class ExecSpace> +struct TestComplexStructuredBindings { + using exec_space = ExecSpace; + using value_type = double; + using complex_type = Kokkos::complex<double>; + using device_view_type = Kokkos::View<complex_type *, exec_space>; + using host_view_type = typename device_view_type::HostMirror; + + device_view_type d_results; + host_view_type h_results; + + // tuple_size + static_assert(std::is_same_v<std::tuple_size<complex_type>::type, + std::integral_constant<size_t, 2>>); + + // tuple_element + static_assert( + std::is_same_v<std::tuple_element_t<0, complex_type>, value_type>); + static_assert( + std::is_same_v<std::tuple_element_t<1, complex_type>, value_type>); + + static void testgetreturnreferencetypes() { + complex_type m; + const complex_type c; + + // get lvalue + complex_type &ml = m; + static_assert(std::is_same_v<decltype(Kokkos::get<0>(ml)), value_type &>); + static_assert(std::is_same_v<decltype(Kokkos::get<1>(ml)), value_type &>); + + // get rvalue + complex_type &&mr = std::move(m); + static_assert( + std::is_same_v<decltype(Kokkos::get<0>(std::move(mr))), value_type &&>); + static_assert( + 
std::is_same_v<decltype(Kokkos::get<1>(std::move(mr))), value_type &&>); + + // get const lvalue + const complex_type &cl = c; + static_assert( + std::is_same_v<decltype(Kokkos::get<0>(cl)), value_type const &>); + static_assert( + std::is_same_v<decltype(Kokkos::get<1>(cl)), value_type const &>); + + // get const rvalue + complex_type const &&cr = std::move(c); + static_assert(std::is_same_v<decltype(Kokkos::get<0>(std::move(cr))), + value_type const &&>); + static_assert(std::is_same_v<decltype(Kokkos::get<1>(std::move(cr))), + value_type const &&>); + + maybe_unused(m, c, ml, mr, cl, cr); + } + + void testit() { + testgetreturnreferencetypes(); + + d_results = device_view_type("TestComplexStructuredBindings", 6); + h_results = Kokkos::create_mirror_view(d_results); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1), *this); + Kokkos::fence(); + Kokkos::deep_copy(h_results, d_results); + + // get lvalue + ASSERT_FLOAT_EQ(h_results[0].real(), 2.); + ASSERT_FLOAT_EQ(h_results[0].imag(), 3.); + + // get rvalue + ASSERT_FLOAT_EQ(h_results[1].real(), 2.); + ASSERT_FLOAT_EQ(h_results[1].imag(), 3.); + + // get const lvalue + ASSERT_FLOAT_EQ(h_results[2].real(), 5.); + ASSERT_FLOAT_EQ(h_results[2].imag(), 7.); + + // get const rvalue + ASSERT_FLOAT_EQ(h_results[3].real(), 5.); + ASSERT_FLOAT_EQ(h_results[3].imag(), 7.); + + // swap real and imaginary + ASSERT_FLOAT_EQ(h_results[4].real(), 11.); + ASSERT_FLOAT_EQ(h_results[4].imag(), 13.); + ASSERT_FLOAT_EQ(h_results[5].real(), 13.); + ASSERT_FLOAT_EQ(h_results[5].imag(), 11.); + } + + KOKKOS_FUNCTION + void operator()(int) const { + complex_type m(2., 3.); + const complex_type c(5., 7.); + + // get lvalue + { + complex_type &ml = m; + auto &[mlr, mli] = ml; + d_results[0] = complex_type(mlr, mli); + } + + // get rvalue + { + complex_type &&mr = std::move(m); + auto &&[mrr, mri] = std::move(mr); + d_results[1] = complex_type(mrr, mri); + } + + // get const lvalue + { + const complex_type &cl = c; + auto &[clr, 
cli] = cl; + d_results[2] = complex_type(clr, cli); + } + + // get const rvalue + { + complex_type const &&cr = std::move(c); + auto &&[crr, cri] = std::move(cr); + d_results[3] = complex_type(crr, cri); + } + + // swap real and imaginary + { + complex_type z(11., 13.); + d_results[4] = z; + + auto &[zr, zi] = z; + Kokkos::kokkos_swap(zr, zi); + d_results[5] = z; + } + } +}; + +TEST(TEST_CATEGORY, complex_structured_bindings) { + TestComplexStructuredBindings<TEST_EXECSPACE> test; + test.testit(); } } // namespace Test + +#ifdef KOKKOS_COMPILER_NVCC +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic pop +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic pop +#endif +#endif +#endif diff --git a/packages/kokkos/core/unit_test/TestConcepts.hpp b/packages/kokkos/core/unit_test/TestConcepts.hpp index 476a8848325c0375cecaadffc773e623ac17f997..6ad370b72f323fa9e1f8f27da4c09393f8daf991 100644 --- a/packages/kokkos/core/unit_test/TestConcepts.hpp +++ b/packages/kokkos/core/unit_test/TestConcepts.hpp @@ -22,42 +22,42 @@ using ExecutionSpace = TEST_EXECSPACE; using MemorySpace = typename ExecutionSpace::memory_space; using DeviceType = typename ExecutionSpace::device_type; -static_assert(Kokkos::is_execution_space<ExecutionSpace>{}, ""); -static_assert(Kokkos::is_execution_space<ExecutionSpace const>{}, ""); -static_assert(!Kokkos::is_execution_space<ExecutionSpace &>{}, ""); -static_assert(!Kokkos::is_execution_space<ExecutionSpace const &>{}, ""); - -static_assert(Kokkos::is_memory_space<MemorySpace>{}, ""); -static_assert(Kokkos::is_memory_space<MemorySpace const>{}, ""); -static_assert(!Kokkos::is_memory_space<MemorySpace &>{}, ""); -static_assert(!Kokkos::is_memory_space<MemorySpace const &>{}, ""); - -static_assert(Kokkos::is_device<DeviceType>{}, ""); -static_assert(Kokkos::is_device<DeviceType const>{}, ""); -static_assert(!Kokkos::is_device<DeviceType &>{}, ""); -static_assert(!Kokkos::is_device<DeviceType const &>{}, ""); - 
-static_assert(!Kokkos::is_device<ExecutionSpace>{}, ""); -static_assert(!Kokkos::is_device<MemorySpace>{}, ""); - -static_assert(Kokkos::is_space<ExecutionSpace>{}, ""); -static_assert(Kokkos::is_space<MemorySpace>{}, ""); -static_assert(Kokkos::is_space<DeviceType>{}, ""); -static_assert(Kokkos::is_space<ExecutionSpace const>{}, ""); -static_assert(Kokkos::is_space<MemorySpace const>{}, ""); -static_assert(Kokkos::is_space<DeviceType const>{}, ""); -static_assert(!Kokkos::is_space<ExecutionSpace &>{}, ""); -static_assert(!Kokkos::is_space<MemorySpace &>{}, ""); -static_assert(!Kokkos::is_space<DeviceType &>{}, ""); - -static_assert(Kokkos::is_execution_space_v<ExecutionSpace>, ""); -static_assert(!Kokkos::is_execution_space_v<ExecutionSpace &>, ""); +static_assert(Kokkos::is_execution_space<ExecutionSpace>{}); +static_assert(Kokkos::is_execution_space<ExecutionSpace const>{}); +static_assert(!Kokkos::is_execution_space<ExecutionSpace &>{}); +static_assert(!Kokkos::is_execution_space<ExecutionSpace const &>{}); + +static_assert(Kokkos::is_memory_space<MemorySpace>{}); +static_assert(Kokkos::is_memory_space<MemorySpace const>{}); +static_assert(!Kokkos::is_memory_space<MemorySpace &>{}); +static_assert(!Kokkos::is_memory_space<MemorySpace const &>{}); + +static_assert(Kokkos::is_device<DeviceType>{}); +static_assert(Kokkos::is_device<DeviceType const>{}); +static_assert(!Kokkos::is_device<DeviceType &>{}); +static_assert(!Kokkos::is_device<DeviceType const &>{}); + +static_assert(!Kokkos::is_device<ExecutionSpace>{}); +static_assert(!Kokkos::is_device<MemorySpace>{}); + +static_assert(Kokkos::is_space<ExecutionSpace>{}); +static_assert(Kokkos::is_space<MemorySpace>{}); +static_assert(Kokkos::is_space<DeviceType>{}); +static_assert(Kokkos::is_space<ExecutionSpace const>{}); +static_assert(Kokkos::is_space<MemorySpace const>{}); +static_assert(Kokkos::is_space<DeviceType const>{}); +static_assert(!Kokkos::is_space<ExecutionSpace &>{}); 
+static_assert(!Kokkos::is_space<MemorySpace &>{}); +static_assert(!Kokkos::is_space<DeviceType &>{}); + +static_assert(Kokkos::is_execution_space_v<ExecutionSpace>); +static_assert(!Kokkos::is_execution_space_v<ExecutionSpace &>); static_assert( - std::is_same<float, Kokkos::Impl::remove_cvref_t<float const &>>{}, ""); -static_assert(std::is_same<int, Kokkos::Impl::remove_cvref_t<int &>>{}, ""); -static_assert(std::is_same<int, Kokkos::Impl::remove_cvref_t<int const>>{}, ""); -static_assert(std::is_same<float, Kokkos::Impl::remove_cvref_t<float>>{}, ""); + std::is_same<float, Kokkos::Impl::remove_cvref_t<float const &>>{}); +static_assert(std::is_same<int, Kokkos::Impl::remove_cvref_t<int &>>{}); +static_assert(std::is_same<int, Kokkos::Impl::remove_cvref_t<int const>>{}); +static_assert(std::is_same<float, Kokkos::Impl::remove_cvref_t<float>>{}); /*------------------------------------------------- begin test for team_handle concept @@ -122,8 +122,9 @@ struct is_team_handle_complete_trait_check { decltype(std::declval<U const &>().team_barrier()); template <class U> - using TeamBroadcastArchetypeExpr = decltype( - std::declval<U const &>().team_broadcast(lvalueForMethodsNeedingIt_, 0)); + using TeamBroadcastArchetypeExpr = + decltype(std::declval<U const &>().team_broadcast( + lvalueForMethodsNeedingIt_, 0)); template <class U> using TeamBroadcastAcceptClosureArchetypeExpr = diff --git a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp index 8e5ff3b9af6091ca2fecb07c228a6ab58b9ed5df..e6d3c735e768bb2d02dc14439db8bee9c1fdfe07 100644 --- a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp +++ b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp @@ -19,7 +19,6 @@ namespace Test { -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA namespace Impl { template <class MemorySpaceA, class MemorySpaceB> struct TestDeepCopy { @@ -228,7 +227,6 @@ TEST(TEST_CATEGORY, deep_copy_alignment) { 
Kokkos::HostSpace>::run_test(100000); } } -#endif namespace Impl { template <class Scalar1, class Scalar2, class Layout1, class Layout2> @@ -242,10 +240,10 @@ struct TestDeepCopyScalarConversion { using view_type_s2_2d = Kokkos::View<Scalar2**, Layout2, TEST_EXECSPACE>; using base_layout1 = - std::conditional_t<std::is_same<Layout1, Kokkos::LayoutStride>::value, + std::conditional_t<std::is_same_v<Layout1, Kokkos::LayoutStride>, Kokkos::LayoutLeft, Layout1>; using base_layout2 = - std::conditional_t<std::is_same<Layout2, Kokkos::LayoutStride>::value, + std::conditional_t<std::is_same_v<Layout2, Kokkos::LayoutStride>, Kokkos::LayoutLeft, Layout2>; using base_type_s1_1d = Kokkos::View<Scalar1*, base_layout1, TEST_EXECSPACE>; diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp deleted file mode 100644 index 929c91db4e00a37e6630e8f6c54a393ab08e6014..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ /dev/null @@ -1,491 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include <gtest/gtest.h> - -#include <Kokkos_Core.hpp> - -#ifdef KOKKOS_ENABLE_OPENMP -#include <omp.h> -#endif -#include <set> -#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) - -namespace Test { - -namespace Impl { - -std::set<char*> delete_these; -void cleanup_memory() { - for (auto x : delete_these) { - delete[] x; - } -} - -char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, - bool do_other, bool do_tune, int& nargs, - Kokkos::InitArguments& init_args) { - nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) + - (do_other ? 4 : 0) + (do_tune ? 1 : 0); - - char** args_kokkos = new char*[nargs]; - const int max_args_size = 45; - for (int i = 0; i < nargs; i++) { - args_kokkos[i] = new char[max_args_size]; - delete_these.insert(args_kokkos[i]); - } - - int threads_idx = do_other ? 1 : 0; - int numa_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0); - int device_idx = - (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0); - int tune_idx = (do_other ? 4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + - (do_device ? 
1 : 0); - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast<unsigned>(nthreads)) - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same<Kokkos::Serial, Kokkos::DefaultExecutionSpace>::value || - std::is_same<Kokkos::Serial, - Kokkos::DefaultHostExecutionSpace>::value) { - nthreads = 1; - } -#endif - - init_args.num_threads = nthreads; - snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads); - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same<Kokkos::Serial, Kokkos::DefaultExecutionSpace>::value || - std::is_same<Kokkos::Serial, - Kokkos::DefaultHostExecutionSpace>::value) { - numa = 1; - } -#endif - - init_args.num_numa = numa; - snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa); - } - - if (do_device) { - init_args.device_id = 0; - snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0); - } - - if (do_other) { - snprintf(args_kokkos[0], max_args_size, "--dummyarg=1"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size, - "--dummy2arg"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size, - "dummy3arg"); - snprintf(args_kokkos[device_idx + (do_device ? 
1 : 0)], max_args_size, - "dummy4arg=1"); - } - - if (do_tune) { - init_args.tune_internals = true; - snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals"); - } - - return args_kokkos; -} - -Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, - bool do_device, bool do_tune) { - Kokkos::InitArguments args; - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast<unsigned>(nthreads)) { - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same<Kokkos::Serial, Kokkos::DefaultExecutionSpace>::value || - std::is_same<Kokkos::Serial, - Kokkos::DefaultHostExecutionSpace>::value) { - nthreads = 1; - } -#endif - - args.num_threads = nthreads; - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same<Kokkos::Serial, Kokkos::DefaultExecutionSpace>::value || - std::is_same<Kokkos::Serial, - Kokkos::DefaultHostExecutionSpace>::value) { - numa = 1; - } -#endif - - args.num_numa = numa; - } - - if (do_device) { - args.device_id = 0; - } - - if (do_tune) { - args.tune_internals = true; - } - - return args; -} - -void check_correct_initialization(const Kokkos::InitArguments& argstruct) { - ASSERT_EQ(Kokkos::DefaultExecutionSpace::impl_is_initialized(), 1); - ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_is_initialized(), 1); - - // Figure out the number of threads the HostSpace ExecutionSpace should have - // initialized to. 
- int expected_nthreads = argstruct.num_threads; - -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same<Kokkos::HostSpace::execution_space, Kokkos::OpenMP>::value) { - // use openmp default num threads - if (expected_nthreads < 0 || - (expected_nthreads == 0 && !Kokkos::hwloc::available())) { - expected_nthreads = omp_get_max_threads(); - } - // use hwloc if available - else if (expected_nthreads == 0 && Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - } -#endif - - if (expected_nthreads < 1) { - if (Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } else { - expected_nthreads = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Serial>::value || - std::is_same<Kokkos::DefaultHostExecutionSpace, - Kokkos::Serial>::value) { - expected_nthreads = 1; - } -#endif - -#ifdef KOKKOS_ENABLE_HPX - // HPX uses all cores on machine by default. Skip this test. 
- if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::HPX>::value || - std::is_same<Kokkos::DefaultHostExecutionSpace, - Kokkos::Experimental::HPX>::value) { - return; - } -#endif - } - - int expected_numa = argstruct.num_numa; - - if (expected_numa < 1) { - if (Kokkos::hwloc::available()) { - expected_numa = Kokkos::hwloc::get_available_numa_count(); - } else { - expected_numa = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Serial>::value || - std::is_same<Kokkos::DefaultHostExecutionSpace, Kokkos::Serial>::value) - expected_numa = 1; -#endif - } - - ASSERT_EQ(Kokkos::HostSpace::execution_space().impl_thread_pool_size(), - expected_nthreads); - -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Cuda>::value) { - int device; - cudaGetDevice(&device); - - int expected_device = argstruct.device_id; - if (argstruct.device_id < 0) { - expected_device = Kokkos::Cuda().cuda_device(); - } - - ASSERT_EQ(expected_device, device); - } -#endif - ASSERT_EQ(argstruct.tune_internals, Kokkos::tune_internals()); -} - -// TODO: Add check whether correct number of threads are actually started. 
-void test_no_arguments() { - Kokkos::initialize(); - check_correct_initialization(Kokkos::InitArguments()); - Kokkos::finalize(); -} - -void test_commandline_args(int nargs, char** args, - const Kokkos::InitArguments& argstruct) { - Kokkos::initialize(nargs, args); - check_correct_initialization(argstruct); - Kokkos::finalize(); -} - -void test_initstruct_args(const Kokkos::InitArguments& args) { - Kokkos::initialize(args); - check_correct_initialization(args); - Kokkos::finalize(); -} - -} // namespace Impl - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -TEST(defaultdevicetypeinit, no_args) { Impl::test_no_arguments(); } -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -TEST(defaultdevicetypeinit, commandline_args_empty) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -TEST(defaultdevicetypeinit, commandline_args_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, true, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -TEST(defaultdevicetypeinit, commandline_args_nthreads) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(true, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, false, false, false, nargs, argstruct); - 
Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -TEST(defaultdevicetypeinit, commandline_args_nthreads_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, false, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -TEST(defaultdevicetypeinit, commandline_args_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(false, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -TEST(defaultdevicetypeinit, commandline_args_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, true, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, true, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef 
KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other_tune) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, true, true, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -TEST(defaultdevicetypeinit, initstruct_default) { - Kokkos::InitArguments args; - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -TEST(defaultdevicetypeinit, initstruct_nthreads) { - Kokkos::InitArguments args = Impl::init_initstruct(true, false, false, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, false, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -TEST(defaultdevicetypeinit, initstruct_device) { - Kokkos::InitArguments args = Impl::init_initstruct(false, false, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -TEST(defaultdevicetypeinit, initstruct_nthreads_device) { - Kokkos::InitArguments args = Impl::init_initstruct(true, false, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device_tune) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, true); - Impl::test_initstruct_args(args); -} -#endif - -} // namespace Test - -#endif diff --git 
a/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp index 66fdecabaa285f8bd48184f61dcd8c166e49efb9..00d3b62d31237b81fbfe751023e93d5574e055ac 100644 --- a/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp +++ b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp @@ -18,11 +18,11 @@ void test_nonesuch() { using Kokkos::nonesuch; - static_assert(!std::is_constructible<nonesuch>::value); - static_assert(!std::is_destructible<nonesuch>::value); - static_assert(!std::is_copy_constructible<nonesuch>::value); - static_assert(!std::is_move_constructible<nonesuch>::value); - static_assert(!std::is_aggregate<nonesuch>::value); + static_assert(!std::is_constructible_v<nonesuch>); + static_assert(!std::is_destructible_v<nonesuch>); + static_assert(!std::is_copy_constructible_v<nonesuch>); + static_assert(!std::is_move_constructible_v<nonesuch>); + static_assert(!std::is_aggregate_v<nonesuch>); } namespace Example { @@ -39,7 +39,7 @@ static_assert(Kokkos::is_detected<copy_assign_t, Meow>::value, "Meow should be copy assignable!"); static_assert(!Kokkos::is_detected<copy_assign_t, Purr>::value, "Purr should not be copy assignable!"); -static_assert(Kokkos::is_detected_exact<Meow&, copy_assign_t, Meow>::value, +static_assert(Kokkos::is_detected_exact_v<Meow&, copy_assign_t, Meow>, "Copy assignment of Meow should return Meow&!"); template <class T> @@ -53,8 +53,8 @@ struct Woof { }; struct Bark {}; -static_assert(std::is_same<difference_type<Woof>, int>::value, +static_assert(std::is_same_v<difference_type<Woof>, int>, "Woof's difference_type should be int!"); -static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value, +static_assert(std::is_same_v<difference_type<Bark>, std::ptrdiff_t>, "Bark's difference_type should be ptrdiff_t!"); } // namespace Example diff --git a/packages/kokkos/core/unit_test/TestDeviceAndThreads.py b/packages/kokkos/core/unit_test/TestDeviceAndThreads.py index 
1d3ff8eea7e7577a63f2dab37a552195f86d5bc7..63d26ad41a4455d66c63fc6930cdff6795988502 100644 --- a/packages/kokkos/core/unit_test/TestDeviceAndThreads.py +++ b/packages/kokkos/core/unit_test/TestDeviceAndThreads.py @@ -17,6 +17,8 @@ import unittest import subprocess +import platform +import os PREFIX = "$<TARGET_FILE_DIR:Kokkos_CoreUnitTest_DeviceAndThreads>" EXECUTABLE = "$<TARGET_FILE_NAME:Kokkos_CoreUnitTest_DeviceAndThreads>" @@ -30,7 +32,22 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - for x in [1, 2, 3, 5, 7]: + args = [] + name = platform.system() + if name == 'Darwin': + args = ['sysctl', '-n', 'hw.physicalcpu_max'] + elif name == 'Linux': + args = ['nproc', '--all'] + else: + args = ['wmic', 'cpu', 'get', 'NumberOfCores'] + + result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output = result.stdout.decode('utf-8') + phys_cores_count = int(output) + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] \ + if GetFlag("hwloc_enabled") else [1,2,3,4,5] + + for x in looplist: if x >= max_threads: break yield x @@ -48,13 +65,25 @@ class KokkosInitializationTestCase(unittest.TestCase): "num_threads", "--kokkos-num-threads={}".format(num_threads))) + def test_num_devices(self): + if "KOKKOS_VISIBLE_DEVICES" in os.environ: + self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set") + num_devices = GetFlag("num_devices") + self.assertNotEqual(num_devices, 0) + if num_devices == -1: + self.skipTest("no device backend enabled") + self.assertGreaterEqual(num_devices, 1) + def test_device_id(self): - device_count = GetFlag("device_count") - if device_count == 0: - self.skipTest("no device detected") + if "KOKKOS_VISIBLE_DEVICES" in os.environ: + self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set") + num_devices = GetFlag("num_devices") + if num_devices == -1: + self.assertEqual(-1, GetFlag("device_id")) + self.skipTest("no device backend enabled") # by default use 
the first GPU available for execution self.assertEqual(0, GetFlag("device_id")) - for device_id in range(device_count): + for device_id in range(num_devices): self.assertEqual( device_id, GetFlag( diff --git a/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp index 65314d6be7cf77f34542d669fcb7e9e74a75622c..a5d0009a664fd1f38f97570b7829afb3dd5dad9f 100644 --- a/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp +++ b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp @@ -28,6 +28,17 @@ struct SumFunctor { void operator()(int i, int& lsum) const { lsum += i; } }; +template <class ExecSpace> +void check_space_member_for_policies(const ExecSpace& exec) { + Kokkos::RangePolicy<ExecSpace> range_policy(exec, 0, 1); + ASSERT_EQ(range_policy.space(), exec); + Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>> mdrange_policy(exec, {0, 0}, + {1, 1}); + ASSERT_EQ(mdrange_policy.space(), exec); + Kokkos::TeamPolicy<ExecSpace> team_policy(exec, 1, Kokkos::AUTO); + ASSERT_EQ(team_policy.space(), exec); +} + template <class ExecSpace> void check_distinctive([[maybe_unused]] ExecSpace exec1, [[maybe_unused]] ExecSpace exec2) { @@ -52,17 +63,25 @@ void check_distinctive([[maybe_unused]] ExecSpace exec1, } #endif #ifdef KOKKOS_ENABLE_SYCL - if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::SYCL>) { + if constexpr (std::is_same_v<ExecSpace, Kokkos::SYCL>) { ASSERT_NE(*exec1.impl_internal_space_instance()->m_queue, *exec2.impl_internal_space_instance()->m_queue); } #endif +#ifdef KOKKOS_ENABLE_HPX + if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::HPX>) { + ASSERT_NE(exec1.impl_instance_id(), exec2.impl_instance_id()); + } +#endif } } // namespace #ifdef KOKKOS_ENABLE_OPENMP template <class Lambda1, class Lambda2> void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + if (omp_get_max_threads() < 2) + GTEST_SKIP() << "insufficient number of supported concurrent 
threads"; + #pragma omp parallel num_threads(2) { if (omp_get_thread_num() == 0) l1(); @@ -89,6 +108,9 @@ void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { void test_partitioning(std::vector<TEST_EXECSPACE>& instances) { check_distinctive(instances[0], instances[1]); + check_space_member_for_policies(instances[0]); + check_space_member_for_policies(instances[1]); + int sum1, sum2; int N = 3910; run_threaded_test( diff --git a/packages/kokkos/core/unit_test/TestExecSpaceThreadSafety.hpp b/packages/kokkos/core/unit_test/TestExecSpaceThreadSafety.hpp new file mode 100644 index 0000000000000000000000000000000000000000..229d6ac2b6a6e9e7bd0903f788ca41fc5c561514 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestExecSpaceThreadSafety.hpp @@ -0,0 +1,370 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> +#include <thread> + +namespace { + +#ifdef KOKKOS_COMPILER_NVHPC +#define THREAD_SAFETY_TEST_UNREACHABLE() __builtin_unreachable() +#else +#define THREAD_SAFETY_TEST_UNREACHABLE() static_assert(true) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +template <class Lambda1, class Lambda2> +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + if (omp_get_max_threads() < 2) + GTEST_SKIP() << "insufficient number of supported concurrent threads"; + +#pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 0) l1(); + if (omp_get_thread_num() == 1) l2(); + } +} +// We cannot run the multithreaded test when threads or HPX is enabled because +// we cannot launch a thread from inside another thread +#elif !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_HPX) +template <class Lambda1, class Lambda2> +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + std::thread t1(l1); + std::thread t2(l2); + t1.join(); + t2.join(); +} +#else +template <class Lambda1, class Lambda2> +void run_threaded_test(const Lambda1 l1, const Lambda2 l2) { + l1(); + l2(); +} +#endif + +// The idea for all of these tests is to access a View from kernels submitted by +// two different threads to the same execution space instance. If the kernels +// are executed concurrently, we expect to count too many increments. 
+void run_exec_space_thread_safety_range() { + constexpr int N = 10000000; + constexpr int M = 10; + + Kokkos::View<int, TEST_EXECSPACE> view("view"); + Kokkos::View<int, TEST_EXECSPACE> error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::RangePolicy<TEST_EXECSPACE>(exec, 0, 1), KOKKOS_LAMBDA(int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() + << "skipping OpenACC test since unsupported host-side atomics cause " + "race conditions during shared allocation reference counting"; + THREAD_SAFETY_TEST_UNREACHABLE(); +#endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_range(); +} + +void run_exec_space_thread_safety_mdrange() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View<int, TEST_EXECSPACE> view("view"); + Kokkos::View<int, TEST_EXECSPACE> error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() + << "skipping OpenACC test since unsupported host-side atomics cause " + "race conditions during shared allocation reference counting"; + THREAD_SAFETY_TEST_UNREACHABLE(); +#endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since test is known to fail for OpenMPTarget"; +#endif + run_exec_space_thread_safety_mdrange(); +} + +void run_exec_space_thread_safety_team_policy() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View<int, TEST_EXECSPACE> view("view"); + Kokkos::View<int, TEST_EXECSPACE> error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<TEST_EXECSPACE>(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type + &team_member) { + Kokkos::single(Kokkos::PerTeam(team_member), [=]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + }); + }); + } + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() + << "skipping OpenACC test since unsupported host-side atomics cause " + "race conditions during shared allocation reference counting"; + THREAD_SAFETY_TEST_UNREACHABLE(); +#endif +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping for 
OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + run_exec_space_thread_safety_team_policy(); +} + +void run_exec_space_thread_safety_range_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View<int, TEST_EXECSPACE> view("view"); + Kokkos::View<int, TEST_EXECSPACE> error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::RangePolicy<TEST_EXECSPACE>(exec, 0, 1), + KOKKOS_LAMBDA(int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_reduce) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() + << "skipping OpenACC test since unsupported host-side atomics cause " + "race conditions during shared allocation reference counting"; + THREAD_SAFETY_TEST_UNREACHABLE(); +#endif + run_exec_space_thread_safety_range_reduce(); +} + +void run_exec_space_thread_safety_mdrange_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View<int, TEST_EXECSPACE> view("view"); + Kokkos::View<int, TEST_EXECSPACE> error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(exec, {0, 0}, + {1, 1}), + KOKKOS_LAMBDA(int, int, int &update) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }, + error); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_mdrange_reduce) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() + << "skipping OpenACC test since unsupported host-side atomics cause " + "race conditions during shared allocation reference counting"; + THREAD_SAFETY_TEST_UNREACHABLE(); +#endif +// FIXME_INTEL +#if defined(KOKKOS_COMPILER_INTEL) && defined(KOKKOS_ENABLE_OPENMP) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::OpenMP>) + GTEST_SKIP() << "skipping since test is known to fail for OpenMP using the " + "legacy Intel compiler"; +#endif + run_exec_space_thread_safety_mdrange_reduce(); +} + +void run_exec_space_thread_safety_team_policy_reduce() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View<int, TEST_EXECSPACE> view("view"); + Kokkos::View<int, TEST_EXECSPACE> error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_reduce( + Kokkos::TeamPolicy<TEST_EXECSPACE>(exec, 1, 1, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type + &team_member, + int &update) { + Kokkos::single(Kokkos::PerTeam(team_member), [=, &update]() { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) ++update; + }); + }, + error); + } + }; + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_team_policy_reduce) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() + << "skipping OpenACC test since unsupported host-side atomics cause " + "race conditions during shared allocation reference counting"; + THREAD_SAFETY_TEST_UNREACHABLE(); +#endif +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if 
(std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping for OpenMPTarget since the test is designed to " + "run with vector_length=1"; +#endif + // FIXME_SYCL +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) + GTEST_SKIP() << "skipping since test is know to fail with SYCL+Cuda"; +#endif + run_exec_space_thread_safety_team_policy_reduce(); +} + +void run_exec_space_thread_safety_range_scan() { + constexpr int N = 1000000; + constexpr int M = 10; + + Kokkos::View<int, TEST_EXECSPACE> view("view"); + Kokkos::View<int, TEST_EXECSPACE> error("error"); + + auto lambda = [=]() { + TEST_EXECSPACE exec; + for (int j = 0; j < M; ++j) { + Kokkos::parallel_scan( + Kokkos::RangePolicy<TEST_EXECSPACE>(exec, 0, 1), + KOKKOS_LAMBDA(int, int &, const bool final) { + if (final) { + Kokkos::atomic_store(view.data(), 0); + for (int i = 0; i < N; ++i) Kokkos::atomic_inc(view.data()); + if (Kokkos::atomic_load(view.data()) != N) + Kokkos::atomic_store(error.data(), 1); + } + }); + } + exec.fence(); + }; + + run_threaded_test(lambda, lambda); + + auto host_error = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, error); + ASSERT_EQ(host_error(), 0); +} + +TEST(TEST_CATEGORY, exec_space_thread_safety_range_scan) { +#ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC + GTEST_SKIP() + << "skipping OpenACC test since unsupported host-side atomics cause " + "race conditions during shared allocation reference counting"; + THREAD_SAFETY_TEST_UNREACHABLE(); +#endif + run_exec_space_thread_safety_range_scan(); +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestExecutionSpace.hpp b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp index 6f0f159c1740546d5d664fef74abf1f689e8a55b..e38e272d3d9928eae68a8fded3ff95d580d81a64 100644 --- a/packages/kokkos/core/unit_test/TestExecutionSpace.hpp +++ b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp @@ -25,13 
+25,7 @@ struct CheckClassWithExecutionSpaceAsDataMemberIsCopyable { Kokkos::DefaultExecutionSpace device; Kokkos::DefaultHostExecutionSpace host; - KOKKOS_FUNCTION void operator()(int, int& e) const { - // not actually doing anything useful, mostly checking that - // ExecutionSpace::in_parallel() is callable - if (static_cast<int>(device.in_parallel()) < 0) { - ++e; - } - } + KOKKOS_FUNCTION void operator()(int i, int& e) const { e += i; } CheckClassWithExecutionSpaceAsDataMemberIsCopyable() { int errors; @@ -50,4 +44,59 @@ TEST(TEST_CATEGORY, execution_space_as_class_data_member) { } #endif +constexpr bool test_execspace_explicit_construction() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef KOKKOS_ENABLE_SERIAL + static_assert(std::is_convertible_v<Kokkos::NewInstance, Kokkos::Serial>); +#endif +#ifdef KOKKOS_ENABLE_OPENMP + static_assert(std::is_convertible_v<int, Kokkos::OpenMP>); +#endif +#ifdef KOKKOS_ENABLE_CUDA + static_assert(std::is_convertible_v<cudaStream_t, Kokkos::Cuda>); +#endif +#ifdef KOKKOS_ENABLE_HIP + static_assert(std::is_convertible_v<hipStream_t, Kokkos::HIP>); +#endif +#ifdef KOKKOS_ENABLE_HPX + static_assert(std::is_convertible_v<Kokkos::Experimental::HPX::instance_mode, + Kokkos::Experimental::HPX>); + static_assert( + std::is_convertible_v<hpx::execution::experimental::unique_any_sender<>&&, + Kokkos::Experimental::HPX>); +#endif +#else +#ifdef KOKKOS_ENABLE_SERIAL + static_assert(!std::is_convertible_v<Kokkos::NewInstance, Kokkos::Serial>); +#endif +#ifdef KOKKOS_ENABLE_OPENMP + static_assert(!std::is_convertible_v<int, Kokkos::OpenMP>); +#endif +#ifdef KOKKOS_ENABLE_CUDA + static_assert(!std::is_convertible_v<cudaStream_t, Kokkos::Cuda>); +#endif +#ifdef KOKKOS_ENABLE_HIP + static_assert(!std::is_convertible_v<hipStream_t, Kokkos::HIP>); +#endif +#ifdef KOKKOS_ENABLE_HPX + static_assert(!std::is_convertible_v<Kokkos::Experimental::HPX::instance_mode, + Kokkos::Experimental::HPX>); + static_assert(!std::is_convertible_v< + 
hpx::execution::experimental::unique_any_sender<>&&, + Kokkos::Experimental::HPX>); +#endif +#endif + +#ifdef KOKKOS_ENABLE_OPENACC + static_assert(!std::is_convertible_v<int, Kokkos::Experimental::OpenACC>); +#endif +#ifdef KOKKOS_ENABLE_SYCL + static_assert(!std::is_convertible_v<sycl::queue, Kokkos::SYCL>); +#endif + + return true; +} + +static_assert(test_execspace_explicit_construction()); + } // namespace diff --git a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp index c024526111b12bef05d945fc82c837f2aa5cd782..f8a49094b7bbcb1063fdebec4b53d954f3822144 100644 --- a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp +++ b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp @@ -59,16 +59,15 @@ void test_functor_analysis() { using R01 = typename A01::Reducer; - static_assert(std::is_void<typename A01::value_type>::value, ""); - static_assert(std::is_void<typename A01::pointer_type>::value, ""); - static_assert(std::is_void<typename A01::reference_type>::value, ""); - static_assert(std::is_same<typename R01::functor_type, decltype(c01)>::value, - ""); - - static_assert(!A01::has_join_member_function, ""); - static_assert(!A01::has_init_member_function, ""); - static_assert(!A01::has_final_member_function, ""); - static_assert(A01::StaticValueSize == 0, ""); + static_assert(std::is_void_v<typename A01::value_type>); + static_assert(std::is_void_v<typename A01::pointer_type>); + static_assert(std::is_void_v<typename A01::reference_type>); + static_assert(std::is_same_v<typename R01::functor_type, decltype(c01)>); + + static_assert(!A01::has_join_member_function); + static_assert(!A01::has_init_member_function); + static_assert(!A01::has_final_member_function); + static_assert(A01::StaticValueSize == 0); ASSERT_EQ(R01(c01).length(), 0); //------------------------------ @@ -78,16 +77,15 @@ void test_functor_analysis() { Kokkos::RangePolicy<ExecSpace>, decltype(c02), void>; using R02 = typename 
A02::Reducer; - static_assert(std::is_same<typename A02::value_type, double>::value, ""); - static_assert(std::is_same<typename A02::pointer_type, double*>::value, ""); - static_assert(std::is_same<typename A02::reference_type, double&>::value, ""); - static_assert(std::is_same<typename R02::functor_type, decltype(c02)>::value, - ""); + static_assert(std::is_same_v<typename A02::value_type, double>); + static_assert(std::is_same_v<typename A02::pointer_type, double*>); + static_assert(std::is_same_v<typename A02::reference_type, double&>); + static_assert(std::is_same_v<typename R02::functor_type, decltype(c02)>); - static_assert(!A02::has_join_member_function, ""); - static_assert(!A02::has_init_member_function, ""); - static_assert(!A02::has_final_member_function, ""); - static_assert(A02::StaticValueSize == sizeof(double), ""); + static_assert(!A02::has_join_member_function); + static_assert(!A02::has_init_member_function); + static_assert(!A02::has_final_member_function); + static_assert(A02::StaticValueSize == sizeof(double)); ASSERT_EQ(R02(c02).length(), 1); //------------------------------ @@ -98,24 +96,20 @@ void test_functor_analysis() { Kokkos::RangePolicy<ExecSpace>, TestFunctorAnalysis_03, void>; using R03 = typename A03::Reducer; - static_assert(std::is_same<typename A03::value_type, - TestFunctorAnalysis_03::value_type>::value, - ""); - static_assert(std::is_same<typename A03::pointer_type, - TestFunctorAnalysis_03::value_type*>::value, - ""); - static_assert(std::is_same<typename A03::reference_type, - TestFunctorAnalysis_03::value_type&>::value, - ""); + static_assert(std::is_same_v<typename A03::value_type, + TestFunctorAnalysis_03::value_type>); + static_assert(std::is_same_v<typename A03::pointer_type, + TestFunctorAnalysis_03::value_type*>); + static_assert(std::is_same_v<typename A03::reference_type, + TestFunctorAnalysis_03::value_type&>); static_assert( - std::is_same<typename R03::functor_type, TestFunctorAnalysis_03>::value, - ""); + 
std::is_same_v<typename R03::functor_type, TestFunctorAnalysis_03>); - static_assert(A03::has_join_member_function, ""); - static_assert(A03::has_init_member_function, ""); - static_assert(!A03::has_final_member_function, ""); - static_assert( - A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); + static_assert(A03::has_join_member_function); + static_assert(A03::has_init_member_function); + static_assert(!A03::has_final_member_function); + static_assert(A03::StaticValueSize == + sizeof(TestFunctorAnalysis_03::value_type)); ASSERT_EQ(R03(c03).length(), 1); //------------------------------ diff --git a/packages/kokkos/core/unit_test/TestGraph.hpp b/packages/kokkos/core/unit_test/TestGraph.hpp index 9a36d08f445aa12841b3d1723729a9c518573379..946604ce388e788e6363032ef53394d6d7b083c2 100644 --- a/packages/kokkos/core/unit_test/TestGraph.hpp +++ b/packages/kokkos/core/unit_test/TestGraph.hpp @@ -19,8 +19,25 @@ #include <gtest/gtest.h> +#include <tools/include/ToolTestingUtilities.hpp> + namespace Test { +template <class ExecSpace, class ValueType> +struct NoOpReduceFunctor { + KOKKOS_FUNCTION void operator()(int, ValueType&) const { + Kokkos::abort("Should never be called!"); + } + KOKKOS_FUNCTION void operator()(int, int, ValueType&) const { + Kokkos::abort("Should never be called!"); + } + KOKKOS_FUNCTION void operator()( + const typename Kokkos::TeamPolicy<ExecSpace>::member_type&, + ValueType&) const { + Kokkos::abort("Should never be called!"); + } +}; + template <class ExecSpace> struct CountTestFunctor { using value_type = int; @@ -66,7 +83,7 @@ struct SetResultToViewFunctor { } }; -struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { +struct TEST_CATEGORY_FIXTURE(graph) : public ::testing::Test { public: using count_functor = CountTestFunctor<TEST_EXECSPACE>; using set_functor = SetViewToValueFunctor<TEST_EXECSPACE, int>; @@ -88,31 +105,179 @@ struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test { } }; 
-TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) { +// Check if a rank-0 view contains a given value. +template <typename Exec, typename ViewType> +::testing::AssertionResult contains( + const Exec& exec, const ViewType& view, + const typename ViewType::value_type& expected) { + static_assert(ViewType::rank() == 0); + typename ViewType::non_const_value_type value; + Kokkos::deep_copy(exec, value, view); + exec.fence(); + if (value != expected) + return ::testing::AssertionFailure() + << expected << " is not in " << view.label() << ", got " << value; + else + return ::testing::AssertionSuccess(); +} + +TEST_F(TEST_CATEGORY_FIXTURE(graph), submit_once) { auto graph = Kokkos::Experimental::create_graph<TEST_EXECSPACE>([&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); }); graph.submit(); - Kokkos::deep_copy(graph.get_execution_space(), count_host, count); - Kokkos::deep_copy(graph.get_execution_space(), bugs_host, bugs); - graph.get_execution_space().fence(); - ASSERT_EQ(1, count_host()); - ASSERT_EQ(0, bugs_host()); + + ASSERT_TRUE(contains(graph.get_execution_space(), count, 1)); + ASSERT_TRUE(contains(graph.get_execution_space(), bugs, 0)); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), submit_once_rvalue) { Kokkos::Experimental::create_graph(ex, [&](auto root) { root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); }).submit(); - Kokkos::deep_copy(ex, count_host, count); - Kokkos::deep_copy(ex, bugs_host, bugs); - ex.fence(); - ASSERT_EQ(1, count_host()); - ASSERT_EQ(0, bugs_host()); + + ASSERT_TRUE(contains(ex, count, 1)); + ASSERT_TRUE(contains(ex, bugs, 0)); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { +// Ensure that Kokkos::Graph::instantiate works. +// For now, Kokkos::Graph::submit will instantiate if needed, +// so this test is not very strong. 
+TEST_F(TEST_CATEGORY_FIXTURE(graph), instantiate_and_submit_once) { + auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { + root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); + }); + graph.instantiate(); + graph.submit(); + + ASSERT_TRUE(contains(ex, count, 1)); + ASSERT_TRUE(contains(ex, bugs, 0)); +} + +// FIXME death tests and fixtures +#define TEST_CATEGORY_FIXTURE_DEATH_HELPER(category, name) \ + category##_##name##_DeathTest +#define TEST_CATEGORY_FIXTURE_DEATH_HELPER_EXPAND(category, name) \ + TEST_CATEGORY_FIXTURE_DEATH_HELPER(category, name) +#define TEST_CATEGORY_FIXTURE_DEATH(name) \ + TEST_CATEGORY_FIXTURE_DEATH_HELPER_EXPAND(TEST_CATEGORY, name) + +struct TEST_CATEGORY_FIXTURE_DEATH(graph) + : public TEST_CATEGORY_FIXTURE(graph) {}; + +// Ensure that Kokkos::Graph::instantiate can be called only once. +// This test checks 2 cases: +// 1. Instantiating after submission is invalid (this also implicitly +// checks that submission instantiates if need be). +// 2. Instantiating twice in a row is invalid. +TEST_F(TEST_CATEGORY_FIXTURE_DEATH(graph), can_instantiate_only_once) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + { + bool checked_assertions = false; + KOKKOS_ASSERT(checked_assertions = true); + if (!checked_assertions) { + GTEST_SKIP() << "Preconditions are not checked."; + } + } + { + auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { + root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); + }); + graph.submit(); + ASSERT_DEATH(graph.instantiate(), + "Expected precondition `.*` evaluated false."); + } + { + auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { + root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); + }); + graph.instantiate(); + ASSERT_DEATH(graph.instantiate(), + "Expected precondition `.*` evaluated false."); + } +} + +// This test submits on an execution space instance different from the +// one passed to the Kokkos::Graph constructor. 
+TEST_F(TEST_CATEGORY_FIXTURE(graph), + submit_onto_another_execution_space_instance) { +#ifdef KOKKOS_ENABLE_OPENMP // FIXME_OPENMP partition_space + if (ex.concurrency() < 2) + GTEST_SKIP() << "insufficient number of supported concurrent threads"; +#endif + + const auto execution_space_instances = + Kokkos::Experimental::partition_space(ex, 1, 1); + + auto graph = Kokkos::Experimental::create_graph( + execution_space_instances.at(0), [&](auto root) { + root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); + }); + graph.instantiate(); + + execution_space_instances.at(0).fence( + "The graph might make async copies to device."); + + graph.submit(execution_space_instances.at(1)); + + ASSERT_TRUE(contains(execution_space_instances.at(1), count, 1)); + ASSERT_TRUE(contains(execution_space_instances.at(1), bugs, 0)); +} + +// This test ensures that it's possible to build a Kokkos::Graph using +// Kokkos::Experimental::create_graph without providing a closure, but giving an +// execution space instance. +TEST_F(TEST_CATEGORY_FIXTURE(graph), create_graph_no_closure_with_exec) { + auto graph = Kokkos::Experimental::create_graph(ex); + + auto root = Kokkos::Impl::GraphAccess::create_root_ref(graph); + + auto node = root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); + + graph.submit(ex); + + ASSERT_TRUE(contains(ex, count, 1)); + ASSERT_TRUE(contains(ex, bugs, 0)); +} + +// This test ensures that it's possible to build a Kokkos::Graph using +// Kokkos::Experimental::create_graph without any argument. +// The test has to be skipped if the test fixture is +// not instantiated for the default execution space. 
+TEST_F(TEST_CATEGORY_FIXTURE(graph), create_graph_no_arg) { + if constexpr (!std::is_same_v<TEST_EXECSPACE, + Kokkos::DefaultExecutionSpace>) { + GTEST_SKIP() << "Skipping since useless if the test fixture is not on the " + "default execution space."; + } + + auto graph = Kokkos::Experimental::create_graph(); + + static_assert(std::is_same_v<typename decltype(graph)::execution_space, + Kokkos::DefaultExecutionSpace>); + + auto root = Kokkos::Impl::GraphAccess::create_root_ref(graph); + + auto node = root.then_parallel_for(1, count_functor{count, bugs, 0, 0}); + + graph.submit(graph.get_execution_space()); + + ASSERT_TRUE(contains(graph.get_execution_space(), count, 1)); + ASSERT_TRUE(contains(graph.get_execution_space(), bugs, 0)); +} + +TEST_F(TEST_CATEGORY_FIXTURE(graph), submit_six) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET team_size incompatible + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since OpenMPTarget can't use team_size 1"; +#endif +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(SYCL_EXT_ONEAPI_GRAPH) // FIXME_SYCL + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) + GTEST_SKIP() << "skipping since test case is known to fail with SYCL"; +#endif + auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0}); auto f_setup_bugs = root.then_parallel_for(1, set_functor{bugs, 0}); @@ -137,15 +302,12 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) { //---------------------------------------- }); graph.submit(); - Kokkos::deep_copy(ex, count_host, count); - Kokkos::deep_copy(ex, bugs_host, bugs); - ex.fence(); - ASSERT_EQ(6, count_host()); - ASSERT_EQ(0, bugs_host()); + ASSERT_TRUE(contains(ex, count, 6)); + ASSERT_TRUE(contains(ex, bugs, 0)); } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), when_all_cycle) { view_type 
reduction_out{"reduction_out"}; view_host reduction_host{"reduction_host"}; Kokkos::Experimental::create_graph(ex, [&](auto root) { @@ -160,21 +322,33 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) { .then_parallel_reduce(6, set_result_functor{count}, reduction_out); //---------------------------------------- }).submit(); - Kokkos::deep_copy(ex, bugs_host, bugs); - Kokkos::deep_copy(ex, count_host, count); - Kokkos::deep_copy(ex, reduction_host, reduction_out); - ex.fence(); - ASSERT_EQ(0, bugs_host()); - ASSERT_EQ(7, count_host()); - ASSERT_EQ(42, reduction_host()); + + ASSERT_TRUE(contains(ex, bugs, 0)); + ASSERT_TRUE(contains(ex, count, 7)); + ASSERT_TRUE(contains(ex, reduction_out, 42)); //---------------------------------------- } -// This test is disabled because we don't currently support copying to host, +// This test requires that the graph execution space can access +// the host memory space because we don't currently support copying to host, // even asynchronously. We _may_ want to do that eventually? 
-TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { - auto graph = Kokkos::Experimental::create_graph( - ex, [&, count_host = count_host](auto root) { +TEST_F(TEST_CATEGORY_FIXTURE(graph), repeat_chain) { + constexpr bool result_not_accessible_by_exec = !Kokkos::SpaceAccessibility< + TEST_EXECSPACE, decltype(bugs_host)::memory_space>::accessible; + + if constexpr (result_not_accessible_by_exec) { + GTEST_SKIP() << "The graph requires the reduction targets like 'bugs_host' " + "to be accessible by the execution space."; + } else { + auto graph = Kokkos::Experimental::create_graph(ex, [&, count_host = + count_host]( + auto root) { + // FIXME_CLANG Recent clang versions would still trigger a similar + // static_assert without the additional if constexpr + constexpr bool result_not_accessible_by_exec_copy = + !Kokkos::SpaceAccessibility< + TEST_EXECSPACE, decltype(bugs_host)::memory_space>::accessible; + if constexpr (!result_not_accessible_by_exec_copy) { //---------------------------------------- root.then_parallel_for(1, set_functor{count, 0}) .then_parallel_for(1, count_functor{count, bugs, 0, 0}) @@ -184,24 +358,43 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) { 1, set_result_functor{bugs}, Kokkos::Sum<int, Kokkos::HostSpace>{bugs_host}); //---------------------------------------- - }); + } + }); - //---------------------------------------- - constexpr int repeats = 10; + //---------------------------------------- + constexpr int repeats = 10; - for (int i = 0; i < repeats; ++i) { - graph.submit(); - ex.fence(); - EXPECT_EQ(2, count_host()); - EXPECT_EQ(0, bugs_host()); + for (int i = 0; i < repeats; ++i) { + graph.submit(); + ex.fence(); + EXPECT_EQ(2, count_host()); + EXPECT_EQ(0, bugs_host()); + } + //---------------------------------------- } - //---------------------------------------- } -TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { - auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) { - 
root.then_parallel_reduce(0, set_result_functor{bugs}, count); - }); +TEST_F(TEST_CATEGORY_FIXTURE(graph), zero_work_reduce) { + auto graph = Kokkos::Experimental::create_graph( + ex, [&](Kokkos::Experimental::GraphNodeRef<TEST_EXECSPACE> root) { + NoOpReduceFunctor<TEST_EXECSPACE, int> no_op_functor; + root.then_parallel_reduce(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 0), + no_op_functor, count) +#if !defined(KOKKOS_ENABLE_SYCL) || \ + defined(SYCL_EXT_ONEAPI_GRAPH) // FIXME_SYCL +#if !defined(KOKKOS_ENABLE_CUDA) && \ + !defined(KOKKOS_ENABLE_HIP) // FIXME_CUDA FIXME_HIP + .then_parallel_reduce( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{{0, 0}, + {0, 0}}, + no_op_functor, count) +#endif + .then_parallel_reduce( + Kokkos::TeamPolicy<TEST_EXECSPACE>{0, Kokkos::AUTO}, + no_op_functor, count) +#endif + ; + }); // These fences are only necessary because of the weirdness of how CUDA // UVM works on pre pascal cards. #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \ @@ -214,12 +407,336 @@ TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) { // UVM works on pre pascal cards. #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \ (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL)) - Kokkos::fence(); + if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) Kokkos::fence(); #endif - graph.submit(); // should reset to 0, but doesn't - Kokkos::deep_copy(ex, count_host, count); + graph.submit(); + + ASSERT_TRUE(contains(ex, count, 0)); +} + +// Ensure that an empty graph can be submitted. 
+TEST_F(TEST_CATEGORY_FIXTURE(graph), empty_graph) { + auto graph = Kokkos::Experimental::create_graph(ex, [](auto) {}); + graph.instantiate(); + graph.submit(ex); ex.fence(); - ASSERT_EQ(count_host(), 0); +} + +template <typename ViewType> +struct ForceGlobalLaunchFunctor { + public: + static constexpr size_t count = +#if defined(KOKKOS_ENABLE_CUDA) + Kokkos::Impl::CudaTraits::ConstantMemoryUsage + +#elif defined(KOKKOS_ENABLE_HIP) + Kokkos::Impl::HIPTraits::ConstantMemoryUsage + +#endif + 1; + + ViewType data; + + ForceGlobalLaunchFunctor(ViewType data_) : data(std::move(data_)) {} + + template <typename T> + KOKKOS_FUNCTION void operator()(const T) const { + ++data(); + } + + private: + std::byte unused[count] = {}; +}; + +// Ensure that "global memory launch" path works. +TEST_F(TEST_CATEGORY_FIXTURE(graph), force_global_launch) { +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (!std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) { +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (!std::is_same_v<TEST_EXECSPACE, Kokkos::HIP>) { +#endif + GTEST_SKIP() << "This execution space does not support global launch."; + +#if defined(KOKKOS_ENABLE_CUDA) || \ + (defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH)) + } + using value_t = int; + using view_t = Kokkos::View<value_t, TEST_EXECSPACE, + Kokkos::MemoryTraits<Kokkos::Atomic>>; + using functor_t = ForceGlobalLaunchFunctor<view_t>; + + const std::string kernel_name = "Let's make it a huge kernel"; + const std::string alloc_label = + kernel_name + " - GraphNodeKernel global memory functor storage"; + + view_t data(Kokkos::view_alloc("witness", ex)); + + using namespace Kokkos::Test::Tools; + listen_tool_events(Config::DisableAll(), Config::EnableAllocs()); + + std::optional<Kokkos::Experimental::Graph<TEST_EXECSPACE>> graph; + + const void* ptr = nullptr; + uint64_t ptr_size = 0; + + ASSERT_TRUE(validate_existence( + [&]() { + graph = 
Kokkos::Experimental::create_graph(ex, [&](const auto& root) { + auto node = root.then_parallel_for( + kernel_name, + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(0, functor_t::count), + Kokkos::Experimental::WorkItemProperty::HintHeavyWeight), + functor_t(data)); + }); + }, + [&](AllocateDataEvent alloc) { + if (alloc.name != alloc_label) + return MatchDiagnostic{ + false, {"Allocation name mismatch (got " + alloc.name + ')'}}; + if (alloc.size < functor_t::count) + return MatchDiagnostic{ + false, + {"Allocation size mismatch (expected at least " + + std::to_string(functor_t::count) + " but got " + + std::to_string(alloc.size) + ')'}}; + ptr = alloc.ptr; + ptr_size = alloc.size; + return MatchDiagnostic{true}; + })); + + graph->instantiate(); + + // Fencing the default execution space instance, as the node policy + // was created without giving an instance (it used the default one). + TEST_EXECSPACE{}.fence( + "Ensure that kernel dispatch to global memory is finished " + "before submission."); + + graph->submit(ex); + ASSERT_TRUE(contains(ex, data, functor_t::count)); + + ASSERT_TRUE(validate_event_set( + [&]() { graph.reset(); }, + [&](DeallocateDataEvent dealloc) { + if (dealloc.name == alloc_label && dealloc.ptr == ptr && + dealloc.size == ptr_size) + return MatchDiagnostic{true}; + return MatchDiagnostic{ + false, {"Either the name or pointer or size did not match"}}; + })); + + listen_tool_events(Config::DisableAll()); +#endif +} + +// Ensure that an empty graph on the default host execution space +// can be submitted. 
+TEST_F(TEST_CATEGORY_FIXTURE(graph), empty_graph_default_host_exec) { + auto graph = + Kokkos::Experimental::create_graph(Kokkos::DefaultHostExecutionSpace{}); + graph.instantiate(); + graph.submit(); + graph.get_execution_space().fence(); +} + +template <typename ViewType, size_t TargetIndex, size_t NumIndices = 0> +struct FetchValuesAndContribute { + static_assert(std::is_same_v<typename ViewType::value_type, + typename ViewType::non_const_value_type>); + + ViewType data; + typename ViewType::value_type value; + Kokkos::Array<size_t, NumIndices> indices{}; + + FetchValuesAndContribute(ViewType data_, + std::integral_constant<size_t, TargetIndex>, + typename ViewType::value_type value_) + : data(std::move(data_)), value(value_) {} + + FetchValuesAndContribute(ViewType data_, + Kokkos::Array<size_t, NumIndices> indices_, + std::integral_constant<size_t, TargetIndex>, + typename ViewType::value_type value_) + : data(std::move(data_)), value(value_), indices(std::move(indices_)) {} + + template <typename T> + KOKKOS_FUNCTION void operator()(const T) const { + for (const auto index : indices) data(TargetIndex) += data(index); + data(TargetIndex) += value; + } +}; + +template <typename ViewType, size_t TargetIndex, size_t NumIndices> +FetchValuesAndContribute(ViewType, const size_t (&)[NumIndices], + std::integral_constant<size_t, TargetIndex>, + typename ViewType::non_const_value_type) + -> FetchValuesAndContribute<ViewType, TargetIndex, NumIndices>; + +// Ensure that we can handle the simple diamond use case. +// +// topology stream-based approach graph-based +// +// A A(exec_0) Using the API to add nodes, no +// / \ fence(exec_0) user-facing fence anymore because +// B C B(exec_0) C(exec_1) we'd like to rely on the graph to +// \ / fence(exec_1) enforce dependencies. 
+// D D(exec_0) +TEST_F(TEST_CATEGORY_FIXTURE(graph), diamond) { +#ifdef KOKKOS_ENABLE_OPENMP // FIXME_OPENMP partition_space + if (ex.concurrency() < 4) + GTEST_SKIP() << "test needs at least 4 OpenMP threads"; +#endif + + const auto execution_space_instances = + Kokkos::Experimental::partition_space(ex, 1, 1, 1, 1); + + const auto exec_0 = execution_space_instances.at(0); + const auto exec_1 = execution_space_instances.at(1); + const auto exec_2 = execution_space_instances.at(2); + const auto exec_3 = execution_space_instances.at(3); + + using policy_t = Kokkos::RangePolicy<TEST_EXECSPACE>; + using view_t = Kokkos::View<int*, TEST_EXECSPACE>; + using view_h_t = Kokkos::View<int*, Kokkos::HostSpace>; + + view_t data(Kokkos::view_alloc(ex, "diamond - data"), 4); + + constexpr int value_A = 42, value_B = 27, value_C = 13, value_D = 147; + std::integral_constant<size_t, 0> index_A; + std::integral_constant<size_t, 1> index_B; + std::integral_constant<size_t, 2> index_C; + std::integral_constant<size_t, 3> index_D; + + auto graph = Kokkos::Experimental::create_graph(exec_2, [&](auto root) { + auto node_A = root.then_parallel_for( + policy_t(exec_0, 0, 1), + FetchValuesAndContribute(data, index_A, value_A)); + + auto node_B = node_A.then_parallel_for( + policy_t(exec_0, 0, 1), + FetchValuesAndContribute(data, {index_A()}, index_B, value_B)); + auto node_C = node_A.then_parallel_for( + policy_t(exec_1, 0, 1), + FetchValuesAndContribute(data, {index_A()}, index_C, value_C)); + + auto node_D = Kokkos::Experimental::when_all(node_B, node_C) + .then_parallel_for( + policy_t(exec_0, 0, 1), + FetchValuesAndContribute(data, {index_B(), index_C()}, + index_D, value_D)); + }); + graph.instantiate(); + + // TODO Check that kernels are running on the execution space instance of + // their policy if the defaulted graph implementation is used. 
+ graph.submit(exec_3); + + view_h_t data_host( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "diamond - data - host"), + 4); + Kokkos::deep_copy(exec_3, data_host, data); + + exec_3.fence(); + + ASSERT_EQ(data_host(index_A()), value_A); + ASSERT_EQ(data_host(index_B()), value_A + value_B); + ASSERT_EQ(data_host(index_C()), value_A + value_C); + ASSERT_EQ(data_host(index_D()), 2 * value_A + value_B + value_C + value_D); +} + +// Test a configuration that has more than one end node. Ensure that we wait for +// them all by adding a manual kernel after the graph. +// This test mainly is there to ensure that the defaulted graph implementation +// enforces a semantically consistent control flow. +// +// topology stream-based approach +// +// A B A(exec_0) B(exec_1) +// \ / | fence(exec_1) +// C | C(exec_0) +// / E E(exec_1) +// D D(exec_0) +// fence(exec_1) +// F F(exec_0) +TEST_F(TEST_CATEGORY_FIXTURE(graph), end_of_submit_control_flow) { +#ifdef KOKKOS_ENABLE_OPENMP // FIXME_OPENMP partition_space + if (ex.concurrency() < 4) + GTEST_SKIP() << "insufficient number of supported concurrent threads"; +#endif + + const auto execution_space_instances = + Kokkos::Experimental::partition_space(ex, 1, 1, 1, 1); + + const auto exec_0 = execution_space_instances.at(0); + const auto exec_1 = execution_space_instances.at(1); + const auto exec_2 = execution_space_instances.at(2); + const auto exec_3 = execution_space_instances.at(3); + + using policy_t = Kokkos::RangePolicy<TEST_EXECSPACE>; + using view_t = Kokkos::View<int*, TEST_EXECSPACE>; + using view_h_t = Kokkos::View<int*, Kokkos::HostSpace>; + + view_t data(Kokkos::view_alloc(ex, "data"), 6); + + constexpr int value_A = 42, value_B = 27, value_C = 13, value_D = 147, + value_E = 496, value_F = 123; + std::integral_constant<size_t, 0> index_A; + std::integral_constant<size_t, 1> index_B; + std::integral_constant<size_t, 2> index_C; + std::integral_constant<size_t, 3> index_D; + std::integral_constant<size_t, 4> index_E; + 
std::integral_constant<size_t, 5> index_F; + + auto graph = Kokkos::Experimental::create_graph(exec_2, [&](auto root) { + auto node_A = root.then_parallel_for( + policy_t(exec_0, 0, 1), + FetchValuesAndContribute(data, index_A, value_A)); + auto node_B = root.then_parallel_for( + policy_t(exec_1, 0, 1), + FetchValuesAndContribute(data, index_B, value_B)); + + auto node_C = Kokkos::Experimental::when_all(node_A, node_B) + .then_parallel_for( + policy_t(exec_0, 0, 1), + FetchValuesAndContribute(data, {index_A(), index_B()}, + index_C, value_C)); + + auto node_D = node_C.then_parallel_for( + policy_t(exec_0, 0, 1), + FetchValuesAndContribute(data, {index_C()}, index_D, value_D)); + auto node_E = node_B.then_parallel_for( + policy_t(exec_1, 0, 1), + FetchValuesAndContribute(data, {index_B()}, index_E, value_E)); + }); + graph.instantiate(); + + // TODO Check that kernels are running on the execution space instance of + // their policy if the defaulted graph implementation is used. + graph.submit(exec_3); + + // clang-format off + Kokkos::parallel_for( + policy_t(exec_3, 0, 1), +#if defined(KOKKOS_COMPILER_GNU) && (1010 == KOKKOS_COMPILER_GNU) + // Workaround CTAD bug, see 7316. 
+ FetchValuesAndContribute<decltype(data), index_F, 2>(data, {index_D(), index_E()}, index_F, value_F)); +#else + FetchValuesAndContribute(data, {index_D(), index_E()}, index_F, value_F)); +#endif + // clang-format on + view_h_t data_host( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "data - host"), 6); + + Kokkos::deep_copy(exec_3, data_host, data); + + exec_3.fence(); + + ASSERT_EQ(data_host(index_A()), value_A); + ASSERT_EQ(data_host(index_B()), value_B); + ASSERT_EQ(data_host(index_C()), value_A + value_B + value_C); + ASSERT_EQ(data_host(index_D()), value_A + value_B + value_C + value_D); + ASSERT_EQ(data_host(index_E()), value_B + value_E); + ASSERT_EQ(data_host(index_F()), + value_A + 2 * value_B + value_C + value_D + value_E + value_F); } } // end namespace Test diff --git a/packages/kokkos/core/unit_test/TestHalfConversion.hpp b/packages/kokkos/core/unit_test/TestHalfConversion.hpp index acefac2692c2e3f54ede4485bfc6fe020985aba8..7fcf3855ae00ec2c6ae16ea5426e5722b68e416b 100644 --- a/packages/kokkos/core/unit_test/TestHalfConversion.hpp +++ b/packages/kokkos/core/unit_test/TestHalfConversion.hpp @@ -26,7 +26,6 @@ void test_half_conversion_type() { T b = Kokkos::Experimental::cast_from_half<T>(a); ASSERT_LT((double(b - base) / double(base)), epsilon); -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA Kokkos::View<T> b_v("b_v"); Kokkos::parallel_for( "TestHalfConversion", 1, KOKKOS_LAMBDA(int) { @@ -37,7 +36,6 @@ void test_half_conversion_type() { Kokkos::deep_copy(b, b_v); ASSERT_LT((double(b - base) / double(base)), epsilon); -#endif // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA } template <class T> @@ -48,7 +46,6 @@ void test_bhalf_conversion_type() { T b = Kokkos::Experimental::cast_from_bhalf<T>(a); ASSERT_LT((double(b - base) / double(base)), epsilon); -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA Kokkos::View<T> b_v("b_v"); Kokkos::parallel_for( "TestHalfConversion", 1, KOKKOS_LAMBDA(int) { @@ -59,7 +56,6 @@ void test_bhalf_conversion_type() { 
Kokkos::deep_copy(b, b_v); ASSERT_LT((double(b - base) / double(base)), epsilon); -#endif // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA } void test_half_conversion() { diff --git a/packages/kokkos/core/unit_test/TestHalfOperators.hpp b/packages/kokkos/core/unit_test/TestHalfOperators.hpp index 752e3b5081612e70c30964ca9d7cec617ceb9161..3a48ba5bf7323695939ea17304b3a7b06d8ab447 100644 --- a/packages/kokkos/core/unit_test/TestHalfOperators.hpp +++ b/packages/kokkos/core/unit_test/TestHalfOperators.hpp @@ -39,6 +39,13 @@ double accept_ref_expected(const bhalf_t& a) { } #endif // !KOKKOS_BHALF_T_IS_FLOAT +struct Batch0 {}; +struct Batch1 {}; +struct Batch2 {}; +struct Batch3 {}; +struct Batch4 {}; +struct Batch5 {}; + enum OP_TESTS { ASSIGN, ASSIGN_CHAINED, @@ -68,6 +75,7 @@ enum OP_TESTS { CDIV_S_H, CDIV_H_D, CDIV_D_H, + N_OP_TESTS_BATCH_0, ADD_H_H, ADD_H_S, ADD_S_H, @@ -110,6 +118,7 @@ enum OP_TESTS { ADD_H_ULI_SZ, ADD_H_ULLI, ADD_H_ULLI_SZ, + N_OP_TESTS_BATCH_1, SUB_H_H, SUB_H_S, SUB_S_H, @@ -152,6 +161,7 @@ enum OP_TESTS { SUB_H_ULI_SZ, SUB_H_ULLI, SUB_H_ULLI_SZ, + N_OP_TESTS_BATCH_2, MUL_H_H, MUL_H_S, MUL_S_H, @@ -194,6 +204,7 @@ enum OP_TESTS { MUL_H_ULI_SZ, MUL_H_ULLI, MUL_H_ULLI_SZ, + N_OP_TESTS_BATCH_3, DIV_H_H, DIV_H_S, DIV_S_H, @@ -236,6 +247,7 @@ enum OP_TESTS { DIV_H_ULI_SZ, DIV_H_ULLI, DIV_H_ULLI_SZ, + N_OP_TESTS_BATCH_4, NEG, AND, OR, @@ -268,96 +280,6 @@ enum OP_TESTS { N_OP_TESTS }; -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) -template <class view_type, class half_type> -struct Functor_TestHalfVolatileOperators { - volatile half_type h_lhs, h_rhs; - view_type actual_lhs, expected_lhs; - double d_lhs, d_rhs; - Functor_TestHalfVolatileOperators(volatile half_type lhs = half_type(0), - volatile half_type rhs = half_type(0)) - : h_lhs(lhs), h_rhs(rhs) { - actual_lhs = view_type("actual_lhs", N_OP_TESTS); - expected_lhs = view_type("expected_lhs", N_OP_TESTS); - 
half_type nv_tmp; - nv_tmp = h_lhs; - d_lhs = static_cast<double>(nv_tmp); - nv_tmp = h_rhs; - d_rhs = static_cast<double>(nv_tmp); - if (std::is_same<view_type, ViewTypeHost>::value) { - auto run_on_host = *this; - run_on_host(0); - } else { - Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators", - Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this); - } - } - - KOKKOS_FUNCTION - void operator()(int) const { - volatile half_type tmp_lhs; - half_type nv_tmp; - - // Initialze output views to catch missing test invocations - for (int i = 0; i < N_OP_TESTS; ++i) { - actual_lhs(i) = 1; - expected_lhs(i) = -1; - } - - nv_tmp = h_lhs; - actual_lhs(ASSIGN) = static_cast<double>(nv_tmp); - expected_lhs(ASSIGN) = d_lhs; - - actual_lhs(LT_H_H) = h_lhs < h_rhs; - expected_lhs(LT_H_H) = d_lhs < d_rhs; - - actual_lhs(LE_H_H) = h_lhs <= h_rhs; - expected_lhs(LE_H_H) = d_lhs <= d_rhs; - - actual_lhs(NEQ) = h_lhs != h_rhs; - expected_lhs(NEQ) = d_lhs != d_rhs; - - actual_lhs(GT_H_H) = h_lhs > h_rhs; - expected_lhs(GT_H_H) = d_lhs > d_rhs; - - actual_lhs(GE_H_H) = h_lhs >= h_rhs; - expected_lhs(GE_H_H) = d_lhs >= d_rhs; - - actual_lhs(EQ) = h_lhs == h_rhs; - expected_lhs(EQ) = d_lhs == d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs += h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CADD_H_H) = static_cast<double>(nv_tmp); - expected_lhs(CADD_H_H) = d_lhs; - expected_lhs(CADD_H_H) += d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs -= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CSUB_H_H) = static_cast<double>(nv_tmp); - expected_lhs(CSUB_H_H) = d_lhs; - expected_lhs(CSUB_H_H) -= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs *= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CMUL_H_H) = static_cast<double>(nv_tmp); - expected_lhs(CMUL_H_H) = d_lhs; - expected_lhs(CMUL_H_H) *= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs /= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CDIV_H_H) = static_cast<double>(nv_tmp); - expected_lhs(CDIV_H_H) = d_lhs; - expected_lhs(CDIV_H_H) /= d_rhs; - } -}; -#endif - template <class view_type, class 
half_type> struct Functor_TestHalfOperators { half_type h_lhs, h_rhs; @@ -374,10 +296,31 @@ struct Functor_TestHalfOperators { if (std::is_same<view_type, ViewTypeHost>::value) { auto run_on_host = *this; - run_on_host(0); + run_on_host(Batch0{}, 0); + run_on_host(Batch1{}, 0); + run_on_host(Batch2{}, 0); + run_on_host(Batch3{}, 0); + run_on_host(Batch4{}, 0); + run_on_host(Batch5{}, 0); } else { - Kokkos::parallel_for("Test::Functor_TestHalfOperators", - Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this); + Kokkos::parallel_for("Test::Functor_TestHalfOperators_0", + Kokkos::RangePolicy<Batch0, ExecutionSpace>(0, 1), + *this); + Kokkos::parallel_for("Test::Functor_TestHalfOperators_1", + Kokkos::RangePolicy<Batch1, ExecutionSpace>(0, 1), + *this); + Kokkos::parallel_for("Test::Functor_TestHalfOperators_2", + Kokkos::RangePolicy<Batch2, ExecutionSpace>(0, 1), + *this); + Kokkos::parallel_for("Test::Functor_TestHalfOperators_3", + Kokkos::RangePolicy<Batch3, ExecutionSpace>(0, 1), + *this); + Kokkos::parallel_for("Test::Functor_TestHalfOperators_4", + Kokkos::RangePolicy<Batch4, ExecutionSpace>(0, 1), + *this); + Kokkos::parallel_for("Test::Functor_TestHalfOperators_5", + Kokkos::RangePolicy<Batch5, ExecutionSpace>(0, 1), + *this); } } @@ -463,20 +406,20 @@ struct Functor_TestHalfOperators { } // END: Binary Arithmetic test helpers - KOKKOS_FUNCTION - void operator()(int) const { - half_type tmp_lhs, tmp2_lhs, *tmp_ptr; - double tmp_d_lhs; - float tmp_s_lhs; #if !defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT - using half_impl_type = typename half_type::impl_type; + using half_impl_type = typename half_type::impl_type; #else - using half_impl_type = half_type; + using half_impl_type = half_type; #endif // !defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT - half_impl_type half_tmp; - // Initialze output views to catch missing test invocations - for (int i = 0; i < N_OP_TESTS; ++i) { + KOKKOS_FUNCTION + void operator()(Batch0, int) const { + 
half_type tmp_lhs, tmp2_lhs; + double tmp_d_lhs; + float tmp_s_lhs; + + // Initialize output views to catch missing test invocations + for (int i = 0; i < N_OP_TESTS_BATCH_0; ++i) { actual_lhs(i) = 1; expected_lhs(i) = -1; } @@ -641,6 +584,15 @@ struct Functor_TestHalfOperators { actual_lhs(CDIV_D_H) = tmp_d_lhs; expected_lhs(CDIV_D_H) = d_lhs; expected_lhs(CDIV_D_H) /= d_rhs; + } + + KOKKOS_FUNCTION + void operator()(Batch1, int) const { + // Initialize output views to catch missing test invocations + for (int i = N_OP_TESTS_BATCH_0 + 1; i < N_OP_TESTS_BATCH_1; ++i) { + actual_lhs(i) = 1; + expected_lhs(i) = -1; + } test_add<half_type, half_type, half_type>(ADD_H_H, ADD_H_H_SZ); test_add<float, half_type, float>(ADD_S_H, ADD_S_H_SZ); @@ -697,6 +649,15 @@ struct Functor_TestHalfOperators { actual_lhs(ADD_H_ULLI) = expected_lhs(ADD_H_ULLI); actual_lhs(ADD_H_ULLI_SZ) = expected_lhs(ADD_H_ULLI_SZ); } + } + + KOKKOS_FUNCTION + void operator()(Batch2, int) const { + // Initialize output views to catch missing test invocations + for (int i = N_OP_TESTS_BATCH_1 + 1; i < N_OP_TESTS_BATCH_2; ++i) { + actual_lhs(i) = 1; + expected_lhs(i) = -1; + } test_sub<half_type, half_type, half_type>(SUB_H_H, SUB_H_H_SZ); test_sub<float, half_type, float>(SUB_S_H, SUB_S_H_SZ); @@ -753,6 +714,15 @@ struct Functor_TestHalfOperators { actual_lhs(SUB_H_ULLI) = expected_lhs(SUB_H_ULLI); actual_lhs(SUB_H_ULLI_SZ) = expected_lhs(SUB_H_ULLI_SZ); } + } + + KOKKOS_FUNCTION + void operator()(Batch3, int) const { + // Initialize output views to catch missing test invocations + for (int i = N_OP_TESTS_BATCH_2 + 1; i < N_OP_TESTS_BATCH_3; ++i) { + actual_lhs(i) = 1; + expected_lhs(i) = -1; + } test_mul<half_type, half_type, half_type>(MUL_H_H, MUL_H_H_SZ); test_mul<float, half_type, float>(MUL_S_H, MUL_S_H_SZ); @@ -809,6 +779,15 @@ struct Functor_TestHalfOperators { actual_lhs(MUL_H_ULI_SZ) = expected_lhs(MUL_H_ULI_SZ); actual_lhs(MUL_H_ULLI_SZ) = expected_lhs(MUL_H_ULLI_SZ); } + } + + 
KOKKOS_FUNCTION + void operator()(Batch4, int) const { + // Initialize output views to catch missing test invocations + for (int i = N_OP_TESTS_BATCH_3 + 1; i < N_OP_TESTS_BATCH_4; ++i) { + actual_lhs(i) = 1; + expected_lhs(i) = -1; + } test_div<half_type, half_type, half_type>(DIV_H_H, DIV_H_H_SZ); test_div<float, half_type, float>(DIV_S_H, DIV_S_H_SZ); @@ -879,6 +858,18 @@ struct Functor_TestHalfOperators { actual_lhs(DIV_H_ULLI) = expected_lhs(DIV_H_ULLI); actual_lhs(DIV_H_ULLI_SZ) = expected_lhs(DIV_H_ULLI_SZ); } + } + + KOKKOS_FUNCTION + void operator()(Batch5, int) const { + half_type tmp_lhs, tmp2_lhs, *tmp_ptr; + half_impl_type half_tmp; + + // Initialize output views to catch missing test invocations + for (int i = N_OP_TESTS_BATCH_4 + 1; i < N_OP_TESTS; ++i) { + actual_lhs(i) = 1; + expected_lhs(i) = -1; + } // TODO: figure out why operator{!,&&,||} are returning __nv_bool actual_lhs(NEG) = static_cast<double>(!h_lhs); @@ -988,39 +979,16 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs); Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { - // printf("op_test = %d\n", op_test); - ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - static_cast<double>(epsilon)); - ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - static_cast<double>(epsilon)); - } - -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) - // Test partial volatile support - volatile half_type _h_lhs = h_lhs; - volatile half_type _h_rhs = h_rhs; - Functor_TestHalfVolatileOperators<ViewType, half_type> f_volatile_device( - _h_lhs, _h_rhs); - Functor_TestHalfVolatileOperators<ViewTypeHost, half_type> f_volatile_host( - _h_lhs, _h_rhs); - - ExecutionSpace().fence(); - Kokkos::deep_copy(f_device_actual_lhs, 
f_device.actual_lhs); - Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); - for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { - // printf("op_test = %d\n", op_test); - if (op_test == ASSIGN || op_test == LT_H_H || op_test == LE_H_H || - op_test == NEQ || op_test == EQ || op_test == GT_H_H || - op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || - op_test == CMUL_H_H || op_test == CDIV_H_H) { + if (op_test != N_OP_TESTS_BATCH_0 && op_test != N_OP_TESTS_BATCH_1 && + op_test != N_OP_TESTS_BATCH_2 && op_test != N_OP_TESTS_BATCH_3 && + op_test != N_OP_TESTS_BATCH_4) { + // printf("op_test = %d\n", op_test); ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), static_cast<double>(epsilon)); ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), static_cast<double>(epsilon)); } } -#endif // is_trivially_copyable is false with the addition of explicit // copy constructors that are required for supporting reductions diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp index 3ee2ff52051a4fece698b13b3d2ce2f69ef15a26..5bb77a97f7cc60c85a976f805643d6ecedb0ce24 100644 --- a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp +++ b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp @@ -37,7 +37,7 @@ template <class SmartPtr> struct CheckAccessStoredPointerAndDereferenceOnDevice { SmartPtr m_device_ptr; using ElementType = typename SmartPtr::element_type; - static_assert(std::is_same<ElementType, Data>::value, ""); + static_assert(std::is_same_v<ElementType, Data>); CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) { @@ -127,8 +127,7 @@ TEST(TEST_CATEGORY, host_shared_ptr_special_members_on_device) { #endif // FIXME_OPENMPTARGET -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) +#if 
!defined(KOKKOS_ENABLE_OPENMPTARGET) namespace { struct Bar { @@ -155,7 +154,7 @@ void host_shared_ptr_test_reference_counting() { static_cast<Foo*>(Kokkos::kokkos_malloc<DevMemSpace>(sizeof(Foo))); Kokkos::View<Foo, DevMemSpace> fp_d(fp_d_ptr); // If using UVM or on the CPU don't make an extra HostCopy - Foo* fp_h_ptr = std::is_same<DevMemSpace, HostMemSpace>::value + Foo* fp_h_ptr = std::is_same_v<DevMemSpace, HostMemSpace> ? fp_d_ptr : static_cast<Foo*>( Kokkos::kokkos_malloc<HostMemSpace>(sizeof(Foo))); @@ -243,10 +242,9 @@ TEST(TEST_CATEGORY, host_shared_ptr_tracking) { Kokkos::CudaUVMSpace>(); #endif #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value) - host_shared_ptr_test_reference_counting< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>(); + if (std::is_same<TEST_EXECSPACE, Kokkos::SYCL>::value) + host_shared_ptr_test_reference_counting<Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLSharedUSMSpace>(); #endif #ifdef KOKKOS_ENABLE_HIP if (std::is_same<TEST_EXECSPACE, Kokkos::HIP>::value) { @@ -258,4 +256,4 @@ TEST(TEST_CATEGORY, host_shared_ptr_tracking) { #endif } -#endif // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA +#endif diff --git a/packages/kokkos/core/unit_test/TestInit.hpp b/packages/kokkos/core/unit_test/TestInit.hpp index 9a8dba8dc8f6e608e52b37687b050ee354b62755..6bf4295eb74514ed1be54c70a2caa26924414fc9 100644 --- a/packages/kokkos/core/unit_test/TestInit.hpp +++ b/packages/kokkos/core/unit_test/TestInit.hpp @@ -23,8 +23,6 @@ namespace Test { TEST(TEST_CATEGORY, init) { ; } -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - template <class ExecSpace> void test_dispatch() { const int repeat = 100; @@ -37,6 +35,5 @@ void test_dispatch() { } TEST(TEST_CATEGORY, dispatch) { test_dispatch<TEST_EXECSPACE>(); } -#endif } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestInitializationSettings.cpp b/packages/kokkos/core/unit_test/TestInitializationSettings.cpp index 
f5be0e47aab47f0e050c799e5a47867da65e2de4..cf7a9e403dad4794298612ba5067bfe39f3d3479 100644 --- a/packages/kokkos/core/unit_test/TestInitializationSettings.cpp +++ b/packages/kokkos/core/unit_test/TestInitializationSettings.cpp @@ -20,30 +20,6 @@ namespace { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void take_initialization_settings(Kokkos::InitializationSettings const&) {} - -TEST(defaultdevicetype, - init_arguments_implicit_conversion_to_initialization_settings) { - Kokkos::InitArguments arguments; - take_initialization_settings(arguments); // check that conversion is implicit - arguments.device_id = 1; - arguments.tune_internals = true; - Kokkos::InitializationSettings settings{arguments}; - EXPECT_FALSE(settings.has_num_threads()); - EXPECT_TRUE(settings.has_device_id()); - EXPECT_EQ(settings.get_device_id(), 1); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); - EXPECT_FALSE(settings.has_disable_warnings()); - EXPECT_TRUE(settings.has_tune_internals()); - EXPECT_TRUE(settings.get_tune_internals()); - EXPECT_FALSE(settings.has_tools_help()); - EXPECT_FALSE(settings.has_tools_libs()); - EXPECT_FALSE(settings.has_tools_args()); -} -#endif - TEST(defaultdevicetype, initialization_settings) { auto const settings = Kokkos::InitializationSettings() .set_num_threads(255) @@ -52,8 +28,6 @@ TEST(defaultdevicetype, initialization_settings) { EXPECT_TRUE(settings.has_num_threads()); EXPECT_EQ(settings.get_num_threads(), 255); EXPECT_FALSE(settings.has_device_id()); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); EXPECT_TRUE(settings.has_disable_warnings()); EXPECT_FALSE(settings.get_disable_warnings()); EXPECT_FALSE(settings.has_tune_internals()); @@ -65,18 +39,16 @@ TEST(defaultdevicetype, initialization_settings) { constexpr bool test_initialization_settings_getter() { #define CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(NAME, TYPE) \ - static_assert(std::is_same< \ + 
static_assert(std::is_same_v< \ decltype(std::declval<Kokkos::InitializationSettings const&>() \ .has_##NAME()), \ - bool>::value); \ - static_assert(std::is_same< \ + bool>); \ + static_assert(std::is_same_v< \ decltype(std::declval<Kokkos::InitializationSettings const&>() \ .get_##NAME()), \ - TYPE>::value); + TYPE>); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_threads, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(device_id, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_devices, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(skip_device, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(disable_warnings, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tune_internals, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_help, bool); @@ -88,7 +60,6 @@ constexpr bool test_initialization_settings_getter() { static_assert(test_initialization_settings_getter()); -static_assert( - std::is_default_constructible<Kokkos::InitializationSettings>::value); +static_assert(std::is_default_constructible_v<Kokkos::InitializationSettings>); } // namespace diff --git a/packages/kokkos/core/unit_test/TestInterOp.cpp b/packages/kokkos/core/unit_test/TestInterOp.cpp index b183b13403aab33b7a3341a5d1cff09b2e7f4b02..85bbe407bfdb1c8289c98e93ece93f56b4ae2c42 100644 --- a/packages/kokkos/core/unit_test/TestInterOp.cpp +++ b/packages/kokkos/core/unit_test/TestInterOp.cpp @@ -20,119 +20,120 @@ // View static_assert( - std::is_same< + std::is_same_v< Kokkos::Experimental::python_view_type_t<Kokkos::View<double*>>, Kokkos::View<double*, typename Kokkos::DefaultExecutionSpace::array_layout, typename Kokkos::DefaultExecutionSpace::memory_space, - Kokkos::Experimental::DefaultViewHooks>>::value, + Kokkos::Experimental::DefaultViewHooks>>, "Error! 
Unexpected python_view_type for: View"); // DynRankView static_assert( - std::is_same< + std::is_same_v< Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<double>>, Kokkos::DynRankView< double, typename Kokkos::DefaultExecutionSpace::array_layout, - typename Kokkos::DefaultExecutionSpace::memory_space>>::value, + typename Kokkos::DefaultExecutionSpace::memory_space>>, "Error! Unexpected python_view_type for: DynRankView"); // View + Execution Space static_assert( - std::is_same< + std::is_same_v< Kokkos::Experimental::python_view_type_t< Kokkos::View<double*, Kokkos::DefaultExecutionSpace>>, Kokkos::View<double*, typename Kokkos::DefaultExecutionSpace::array_layout, typename Kokkos::DefaultExecutionSpace::memory_space, - Kokkos::Experimental::DefaultViewHooks>>::value, + Kokkos::Experimental::DefaultViewHooks>>, "Error! Unexpected python_view_type for: View + Execution Space"); // DynRankView + Execution Space static_assert( - std::is_same< + std::is_same_v< Kokkos::Experimental::python_view_type_t< Kokkos::DynRankView<double, Kokkos::DefaultExecutionSpace>>, Kokkos::DynRankView< double, typename Kokkos::DefaultExecutionSpace::array_layout, - typename Kokkos::DefaultExecutionSpace::memory_space>>::value, + typename Kokkos::DefaultExecutionSpace::memory_space>>, "Error! Unexpected python_view_type for: DynRankView + Execution Space"); // View + Memory space -static_assert( - std::is_same<Kokkos::Experimental::python_view_type_t< - Kokkos::View<int64_t*, Kokkos::HostSpace>>, - Kokkos::View<int64_t*, Kokkos::LayoutRight, Kokkos::HostSpace, - Kokkos::Experimental::DefaultViewHooks>>::value, - "Error! Unexpected python_view_type for: View + Memory space"); +static_assert(std::is_same_v< + Kokkos::Experimental::python_view_type_t< + Kokkos::View<int64_t*, Kokkos::HostSpace>>, + Kokkos::View<int64_t*, Kokkos::LayoutRight, Kokkos::HostSpace, + Kokkos::Experimental::DefaultViewHooks>>, + "Error! 
Unexpected python_view_type for: View + Memory space"); // DynRankView + Memory space static_assert( - std::is_same<Kokkos::Experimental::python_view_type_t< - Kokkos::DynRankView<int16_t, Kokkos::HostSpace>>, - Kokkos::DynRankView<int16_t, Kokkos::LayoutRight, - Kokkos::HostSpace>>::value, + std::is_same_v< + Kokkos::Experimental::python_view_type_t< + Kokkos::DynRankView<int16_t, Kokkos::HostSpace>>, + Kokkos::DynRankView<int16_t, Kokkos::LayoutRight, Kokkos::HostSpace>>, "Error! Unexpected python_view_type for: DynRankView + Memory space"); // View + Layout + Execution space static_assert( - std::is_same< + std::is_same_v< Kokkos::Experimental::python_view_type_t<Kokkos::View< int**, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>, Kokkos::View<int**, Kokkos::LayoutLeft, typename Kokkos::DefaultExecutionSpace::memory_space, - Kokkos::Experimental::DefaultViewHooks>>::value, + Kokkos::Experimental::DefaultViewHooks>>, "Error! Unexpected python_view_type for: View + Layout + Execution space"); // DynRankView + Layout + Execution space static_assert( - std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView< - int, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>, - Kokkos::DynRankView<int, Kokkos::LayoutLeft, - typename Kokkos::DefaultExecutionSpace:: - memory_space>>::value, + std::is_same_v<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView< + int, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>, + Kokkos::DynRankView< + int, Kokkos::LayoutLeft, + typename Kokkos::DefaultExecutionSpace::memory_space>>, "Error! 
Unexpected python_view_type for: DynRankView + Layout + Execution " "space"); // View + Layout + Memory Space static_assert( - std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::View< - uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>, - Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace, - Kokkos::Experimental::DefaultViewHooks>>::value, + std::is_same_v< + Kokkos::Experimental::python_view_type_t< + Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>, + Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace, + Kokkos::Experimental::DefaultViewHooks>>, "Error! Unexpected python_view_type for: View + Layout + Memory Space"); // DynRankView + Layout + Memory Space static_assert( - std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView< - uint64_t, Kokkos::LayoutLeft, Kokkos::HostSpace>>, - Kokkos::DynRankView<uint64_t, Kokkos::LayoutLeft, - Kokkos::HostSpace>>::value, + std::is_same_v< + Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView< + uint64_t, Kokkos::LayoutLeft, Kokkos::HostSpace>>, + Kokkos::DynRankView<uint64_t, Kokkos::LayoutLeft, Kokkos::HostSpace>>, "Error! Unexpected python_view_type for: DynRankView + Layout + Memory " "Space"); // View + Layout + Execution space + Memory Trait static_assert( - std::is_same< + std::is_same_v< Kokkos::Experimental::python_view_type_t<Kokkos::View< float***, Kokkos::LayoutLeft, Kokkos::DefaultHostExecutionSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess>>>, Kokkos::View<float***, Kokkos::LayoutLeft, typename Kokkos::DefaultHostExecutionSpace::memory_space, Kokkos::Experimental::DefaultViewHooks, - Kokkos::MemoryTraits<Kokkos::RandomAccess>>>::value, + Kokkos::MemoryTraits<Kokkos::RandomAccess>>>, "Error! 
Unexpected python_view_type for: View + Layout + Execution space + " "Memory Trait"); // DynRankView + Layout + Execution space + Memory trait static_assert( - std::is_same< + std::is_same_v< Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView< float, Kokkos::LayoutLeft, Kokkos::DefaultHostExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic>>>, Kokkos::DynRankView< float, Kokkos::LayoutLeft, typename Kokkos::DefaultHostExecutionSpace::memory_space, - Kokkos::MemoryTraits<Kokkos::Atomic>>>::value, + Kokkos::MemoryTraits<Kokkos::Atomic>>>, "Error! Unexpected python_view_type for: DynRankView + Layout + Execution " "space + Memory trait"); diff --git a/packages/kokkos/core/unit_test/TestIrregularLayout.hpp b/packages/kokkos/core/unit_test/TestIrregularLayout.hpp index c04d2cca3ee65ea16da20af5afc9296f55492f29..93fb3a11036a158cb491cff396df5515ba9f7875 100644 --- a/packages/kokkos/core/unit_test/TestIrregularLayout.hpp +++ b/packages/kokkos/core/unit_test/TestIrregularLayout.hpp @@ -137,8 +137,8 @@ struct ViewOffset<Dimension, Kokkos::LayoutSelective, void> { } //---------------------------------------- - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; KOKKOS_INLINE_FUNCTION diff --git a/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp b/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp index 24cf52aa7090eabae6dc4434f392349557f31789..efe4a2307a827366423480dfbd87f9c57bf6b055 100644 --- a/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -36,9 +36,8 @@ KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, } static_assert((no_error | error_operator_plus_equal_volatile) == - error_operator_plus_equal_volatile, - ""); -static_assert((error_join_volatile | error_operator_plus_equal) == 0b101, ""); + 
error_operator_plus_equal_volatile); +static_assert((error_join_volatile | error_operator_plus_equal) == 0b101); struct MyJoinBackCompatValueType { MyErrorCode err = no_error; diff --git a/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp b/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp index 1ee23a47c45671f85e79364c0d857bea1b2a6cba..f4b68a8d2bb6d2deeb4d1795650bb06241f2820a 100644 --- a/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp +++ b/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp @@ -904,16 +904,9 @@ void impl_test_local_deepcopy_rangepolicy_rank_7(const int N) { } //------------------------------------------------------------------------------------------------------------- -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutleft) { using ExecSpace = TEST_EXECSPACE; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if (std::is_same_v<ExecSpace, Kokkos::Cuda>) - GTEST_SKIP() - << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views"; -#endif - using ViewType = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>; + using ViewType = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>; { // Rank-1 impl_test_local_deepcopy_teampolicy_rank_1<ExecSpace, ViewType>(8); @@ -940,13 +933,7 @@ TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutleft) { //------------------------------------------------------------------------------------------------------------- TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutleft) { using ExecSpace = TEST_EXECSPACE; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if (std::is_same_v<ExecSpace, Kokkos::Cuda>) - GTEST_SKIP() - << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views"; -#endif - using ViewType = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>; + using ViewType = Kokkos::View<double********, Kokkos::LayoutLeft, 
ExecSpace>; { // Rank-1 impl_test_local_deepcopy_rangepolicy_rank_1<ExecSpace, ViewType>(8); @@ -973,12 +960,6 @@ TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutleft) { //------------------------------------------------------------------------------------------------------------- TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutright) { using ExecSpace = TEST_EXECSPACE; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if (std::is_same_v<ExecSpace, Kokkos::Cuda>) - GTEST_SKIP() - << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views"; -#endif using ViewType = Kokkos::View<double********, Kokkos::LayoutRight, ExecSpace>; { // Rank-1 @@ -1006,12 +987,6 @@ TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutright) { //------------------------------------------------------------------------------------------------------------- TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutright) { using ExecSpace = TEST_EXECSPACE; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if (std::is_same_v<ExecSpace, Kokkos::Cuda>) - GTEST_SKIP() - << "FIXME_NVHPC : Compiler bug affecting subviews of high rank Views"; -#endif using ViewType = Kokkos::View<double********, Kokkos::LayoutRight, ExecSpace>; @@ -1037,7 +1012,6 @@ TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutright) { impl_test_local_deepcopy_rangepolicy_rank_7<ExecSpace, ViewType>(8); } } -#endif namespace Impl { template <typename T, typename SHMEMTYPE> diff --git a/packages/kokkos/core/unit_test/TestMDRange.hpp b/packages/kokkos/core/unit_test/TestMDRange.hpp index 3e80e7a01bf0331bbaadbdb963fa5bea97ca7779..7161bbfba5f9d6bb49133e525b57c89e9aca9336 100644 --- a/packages/kokkos/core/unit_test/TestMDRange.hpp +++ b/packages/kokkos/core/unit_test/TestMDRange.hpp @@ -212,7 +212,6 @@ struct TestMDRange_2D { } static void test_reduce2(const int N0, const int N1) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { 
using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, @@ -231,7 +230,6 @@ struct TestMDRange_2D { sum); ASSERT_EQ(sum, N0 * N1); } -#endif { using range_type = @@ -315,7 +313,6 @@ struct TestMDRange_2D { ASSERT_EQ(sum, 2 * N0 * N1); } // Test Min reducer with lambda -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, @@ -341,7 +338,7 @@ struct TestMDRange_2D { ASSERT_EQ(min, 4.0); } -#endif + // Tagged operator test { using range_type = typename Kokkos::MDRangePolicy< @@ -478,7 +475,6 @@ struct TestMDRange_2D { } // end test_reduce2 static void test_for2(const int N0, const int N1) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, @@ -517,7 +513,6 @@ struct TestMDRange_2D { ASSERT_EQ(counter, 0); } -#endif { using range_type = @@ -846,7 +841,6 @@ struct TestMDRange_3D { } static void test_reduce3(const int N0, const int N1, const int N2) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, @@ -864,7 +858,6 @@ struct TestMDRange_3D { sum); ASSERT_EQ(sum, N0 * N1 * N2); } -#endif { using range_type = @@ -947,7 +940,6 @@ struct TestMDRange_3D { ASSERT_EQ(sum, 2 * N0 * N1 * N2); } // Test Min reducer with lambda -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, @@ -980,7 +972,6 @@ struct TestMDRange_3D { ASSERT_EQ(min, min_identity); } } -#endif // Tagged operator test { @@ -1119,7 +1110,6 @@ struct TestMDRange_3D { } // end test_reduce3 static void test_for3(const int N0, const int N1, const int N2) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, @@ -1162,7 +1152,6 @@ struct TestMDRange_3D { ASSERT_EQ(counter, 0); } -#endif { using range_type = @@ 
-1473,7 +1462,6 @@ struct TestMDRange_4D { static void test_reduce4(const int N0, const int N1, const int N2, const int N3) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, @@ -1491,7 +1479,6 @@ struct TestMDRange_4D { sum); ASSERT_EQ(sum, N0 * N1 * N2 * N3); } -#endif { using range_type = @@ -1578,7 +1565,6 @@ struct TestMDRange_4D { } // Test Min reducer with lambda -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, @@ -1606,7 +1592,6 @@ struct TestMDRange_4D { ASSERT_EQ(min, 16.0); } -#endif // Tagged operator test { @@ -1748,7 +1733,6 @@ struct TestMDRange_4D { static void test_for4(const int N0, const int N1, const int N2, const int N3) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, @@ -1792,7 +1776,6 @@ struct TestMDRange_4D { ASSERT_EQ(counter, 0); } -#endif { using range_type = @@ -2118,7 +2101,6 @@ struct TestMDRange_5D { static void test_reduce5(const int N0, const int N1, const int N2, const int N3, const int N4) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>, @@ -2138,7 +2120,6 @@ struct TestMDRange_5D { sum); ASSERT_EQ(sum, N0 * N1 * N2 * N3 * N4); } -#endif { using range_type = @@ -2231,7 +2212,6 @@ struct TestMDRange_5D { } // Test Min reducer with lambda -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>, @@ -2263,7 +2243,6 @@ struct TestMDRange_5D { ASSERT_EQ(min, 32.0); } -#endif // Tagged operator test { @@ -2312,7 +2291,6 @@ struct TestMDRange_5D { static void test_for5(const int N0, const int N1, const int N2, const int N3, const int N4) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { using range_type = typename 
Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>, @@ -2360,7 +2338,6 @@ struct TestMDRange_5D { ASSERT_EQ(counter, 0); } -#endif { using range_type = @@ -2706,7 +2683,6 @@ struct TestMDRange_6D { static void test_reduce6(const int N0, const int N1, const int N2, const int N3, const int N4, const int N5) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { #if defined(KOKKOS_COMPILER_INTEL) // Launchbounds causes hang with intel compilers @@ -2735,7 +2711,6 @@ struct TestMDRange_6D { sum); ASSERT_EQ(sum, N0 * N1 * N2 * N3 * N4 * N5); } -#endif { #if defined(KOKKOS_COMPILER_INTEL) @@ -2889,7 +2864,6 @@ struct TestMDRange_6D { } // Test Min reducer with lambda -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { #if defined(KOKKOS_COMPILER_INTEL) // Launchbounds causes hang with intel compilers @@ -2931,7 +2905,6 @@ struct TestMDRange_6D { ASSERT_EQ(min, 64.0); } -#endif // Tagged operator test { @@ -2997,7 +2970,6 @@ struct TestMDRange_6D { static void test_for6(const int N0, const int N1, const int N2, const int N3, const int N4, const int N5) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) { #if defined(KOKKOS_COMPILER_INTEL) // Launchbounds causes hang with intel compilers @@ -3056,7 +3028,6 @@ struct TestMDRange_6D { ASSERT_EQ(counter, 0); } -#endif { #if defined(KOKKOS_COMPILER_INTEL) @@ -3855,7 +3826,6 @@ struct TestMDRange_ReduceScalar { }; static void test_scalar_reduce(const int N0, const int N1) { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Scalar sum; using range_type = typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, @@ -3873,10 +3843,6 @@ struct TestMDRange_ReduceScalar { }, sum); for (int i = 0; i < 4; i++) ASSERT_EQ(sum.v[i], N0 * N1); -#else - std::ignore = N0; - std::ignore = N1; -#endif } }; diff --git a/packages/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp b/packages/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2c3d021c353982a6010ec2f3d86e60c184bbc76 --- 
/dev/null +++ b/packages/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp @@ -0,0 +1,138 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +namespace { + +struct TestMDRangePolicyCTAD { + template <typename... Ts> + static void maybe_unused(Ts&&...) {} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v<SomeExecutionSpace>); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int t[5]; + [[maybe_unused]] static inline int64_t tt[5]; + [[maybe_unused]] static inline Kokkos::Array<int64_t, 3> a; + [[maybe_unused]] static inline Kokkos::Array<int64_t, 2> aa; + [[maybe_unused]] static inline int64_t i64; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for 
HIP-ROCm-5.2 "declared but never referenced" + TestMDRangePolicyCTAD() { + maybe_unused(des, notEs, ses, t, tt, a, aa, notEsToDes, i64); + } + + // MDRangePolicy with C array parameters + + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(t)>>, + decltype(Kokkos::MDRangePolicy(t, t))>); + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(t)>>, + decltype(Kokkos::MDRangePolicy(t, t, tt))>); + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(t)>>, + decltype(Kokkos::MDRangePolicy(des, t, tt))>); + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(t)>>, + decltype(Kokkos::MDRangePolicy(notEs, t, t))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy<SomeExecutionSpace, Kokkos::Rank<std::size(t)>>, + decltype(Kokkos::MDRangePolicy(ses, t, t))>); + + // MDRangePolicy with Kokkos::initializer_list parameters + + static_assert(std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<6>>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}))>); + + static_assert(std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<6>>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert(std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<6>>, + decltype(Kokkos::MDRangePolicy( + des, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<6>>, + decltype(Kokkos::MDRangePolicy(notEs, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<SomeExecutionSpace, Kokkos::Rank<6>>, + decltype(Kokkos::MDRangePolicy(ses, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + // MDRangePolicy with Kokkos::Array parameters + + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(a)>>, + decltype(Kokkos::MDRangePolicy(a, a))>); + static_assert( + 
std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(a)>>, + decltype(Kokkos::MDRangePolicy(a, a, aa))>); + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(a)>>, + decltype(Kokkos::MDRangePolicy(des, a, a))>); + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(a)>>, + decltype(Kokkos::MDRangePolicy(notEs, a, a))>); + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(a)>>, + decltype(Kokkos::MDRangePolicy(des, a, a, aa))>); + static_assert( + std::is_same_v<Kokkos::MDRangePolicy<Kokkos::Rank<std::size(a)>>, + decltype(Kokkos::MDRangePolicy(notEs, a, a, aa))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy<SomeExecutionSpace, Kokkos::Rank<std::size(a)>>, + decltype(Kokkos::MDRangePolicy(ses, a, a))>); + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy<SomeExecutionSpace, Kokkos::Rank<std::size(a)>>, + decltype(Kokkos::MDRangePolicy(ses, a, a, aa))>); +}; + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp b/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp index f577f415e7cb9486514de4004b88a66536183cc7..389ff8e7773231130a3bbc53d0252fbb59108a99 100644 --- a/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp +++ b/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp @@ -18,6 +18,8 @@ #include <Kokkos_Core.hpp> +#include <regex> + namespace { template <class IndexType> @@ -86,13 +88,116 @@ TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) { using Policy = Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, Kokkos::IndexType<unsigned>>; + std::string msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is " + "performed on a bound (-1) in dimension (0), which may not preserve its " + "original value.\n"; + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + + 
::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ (void)Policy({-1, 0}, {2, 3}); }, expected); +} + +TEST(TEST_CATEGORY_DEATH, policy_invalid_bounds) { + using Policy = Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + auto [dim0, dim1] = (Policy::inner_direction == Kokkos::Iterate::Right) + ? std::make_pair(1, 0) + : std::make_pair(0, 1); + std::string msg1 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim0) + ".\n"; + + std::string msg2 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim1) + ".\n"; + +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + // escape the parentheses in the regex to match the error message + msg1 = std::regex_replace(msg1, std::regex("\\(|\\)"), "\\$&"); + (void)msg2; ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { - (void)Policy({-1, 0}, {2, 3}); - }, - "unsafe narrowing conversion"); + ASSERT_DEATH({ (void)Policy({100, 100}, {90, 90}); }, msg1); +#else + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + ::testing::internal::CaptureStderr(); + (void)Policy({100, 100}, {90, 90}); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg1 + msg2); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg1; + (void)msg2; +#endif + +#endif } #endif +TEST(TEST_CATEGORY, policy_get_tile_size) { + constexpr int rank = 3; + using Policy = Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<rank>>; + using tile_type = typename Policy::tile_type; + + std::size_t last_rank = + (Policy::inner_direction == Kokkos::Iterate::Right) ? 
rank - 1 : 0; + + auto default_size_properties = + Kokkos::Impl::get_tile_size_properties(TEST_EXECSPACE()); + + { + int dim_length = 100; + Policy policy_default({0, 0, 0}, {dim_length, dim_length, dim_length}); + + auto rec_tile_sizes = policy_default.tile_size_recommended(); + auto internal_tile_sizes = policy_default.m_tile; + + for (std::size_t i = 0; i < rank; ++i) { + EXPECT_EQ(rec_tile_sizes[i], internal_tile_sizes[i]) + << " incorrect recommended tile size returned for rank " << i; + } + } + { + int dim_length = 100; + Policy policy({0, 0, 0}, {dim_length, dim_length, dim_length}, + tile_type{{2, 4, 16}}); + + auto rec_tile_sizes = policy.tile_size_recommended(); + + EXPECT_EQ(default_size_properties.max_total_tile_size, + policy.max_total_tile_size()); + + int prod_rec_tile_size = 1; + for (std::size_t i = 0; i < rank; ++i) { + EXPECT_GT(rec_tile_sizes[i], 0) + << " invalid default tile size for rank " << i; + + if (default_size_properties.default_largest_tile_size == 0) { + auto expected_rec_tile_size = + (i == last_rank) ? dim_length + : default_size_properties.default_tile_size; + EXPECT_EQ(expected_rec_tile_size, rec_tile_sizes[i]) + << " incorrect recommended tile size returned for rank " << i; + } else { + auto expected_rec_tile_size = + (i == last_rank) ? 
default_size_properties.default_largest_tile_size + : default_size_properties.default_tile_size; + EXPECT_EQ(expected_rec_tile_size, rec_tile_sizes[i]) + << " incorrect recommended tile size returned for rank " << i; + } + + prod_rec_tile_size *= rec_tile_sizes[i]; + } + EXPECT_LT(prod_rec_tile_size, policy.max_total_tile_size()); + } +} + } // namespace diff --git a/packages/kokkos/core/unit_test/TestMDRangeReduce.hpp b/packages/kokkos/core/unit_test/TestMDRangeReduce.hpp index 007fa420c3a8fba78ee89dd81cd031583a795f67..24bd3255fe87628bc2b01bdff54af9dc3b068a5a 100644 --- a/packages/kokkos/core/unit_test/TestMDRangeReduce.hpp +++ b/packages/kokkos/core/unit_test/TestMDRangeReduce.hpp @@ -49,8 +49,6 @@ TEST(TEST_CATEGORY, mdrange_parallel_reduce_primitive_types) { #if defined(KOKKOS_ENABLE_OPENMPTARGET) GTEST_SKIP() << "FIXME OPENMPTARGET Tests of MDRange reduce over values " "smaller than int would fail"; -#elif defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) - GTEST_SKIP() << "Skipped ENABLE_CUDA_LAMBDA"; #else for (int bound : {0, 1, 7, 32, 65, 7000}) { for (int k = 0; k < bound; ++k) { diff --git a/packages/kokkos/core/unit_test/TestMDRange_g.hpp b/packages/kokkos/core/unit_test/TestMDRange_g.hpp index aaa9a91acce7c13f4fb85ff90c0114f2702e3af8..d378e3d6c6bdc4282afb5cbdebb8184fd2f46248 100644 --- a/packages/kokkos/core/unit_test/TestMDRange_g.hpp +++ b/packages/kokkos/core/unit_test/TestMDRange_g.hpp @@ -14,7 +14,7 @@ // //@HEADER -//#include <gtest/gtest.h> +// #include <gtest/gtest.h> #include <Kokkos_Core.hpp> diff --git a/packages/kokkos/core/unit_test/TestMDSpan.hpp b/packages/kokkos/core/unit_test/TestMDSpan.hpp index ef0bea1394a6bb5084be1879062c4b7479c42baa..fa88b547a5f0b0845d26c8d50673ab67bb162e1d 100644 --- a/packages/kokkos/core/unit_test/TestMDSpan.hpp +++ b/packages/kokkos/core/unit_test/TestMDSpan.hpp @@ -35,13 +35,19 @@ void test_mdspan_minimal_functional() { Kokkos::parallel_reduce( "CheckMinimalMDSpan", 
Kokkos::RangePolicy<TEST_EXECSPACE>(0, N), KOKKOS_LAMBDA(int i, int& err) { +#if !defined(KOKKOS_ENABLE_OPENACC) Kokkos::mdspan<int, Kokkos::dextents<int, 1>> b_mds(a.data(), N); -#ifdef KOKKOS_ENABLE_CXX23 +#endif +#if !defined(KOKKOS_ENABLE_CXX17) && !defined(KOKKOS_ENABLE_CXX20) if (a_mds[i] != i) err++; +#if !defined(KOKKOS_ENABLE_OPENACC) if (b_mds[i] != i) err++; +#endif #else if (a_mds(i) != i) err++; +#if !defined(KOKKOS_ENABLE_OPENACC) if (b_mds(i) != i) err++; +#endif #endif }, errors); diff --git a/packages/kokkos/core/unit_test/TestMDSpanAtomicAccessor.hpp b/packages/kokkos/core/unit_test/TestMDSpanAtomicAccessor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..04460e6419513efb28dd25a2583cd3a4f8608f80 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDSpanAtomicAccessor.hpp @@ -0,0 +1,112 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> +#include <type_traits> + +#include <gtest/gtest.h> +#ifndef KOKKOS_ENABLE_CXX17 +#include <concepts> +#endif + +template <class T, class ExecutionSpace> +void test_atomic_accessor() { + using value_type = std::remove_const_t<T>; + Kokkos::View<value_type*, ExecutionSpace> v("V", 100); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecutionSpace>(0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = i; }); + + int errors; + using acc_t = Kokkos::Impl::AtomicAccessorRelaxed<T>; + acc_t acc{}; + typename acc_t::data_handle_type ptr = v.data(); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecutionSpace>(0, v.extent(0)), + KOKKOS_LAMBDA(int i, int& error) { + if (acc.access(ptr, i) != ptr[i]) error++; + if (acc.offset(ptr, i) != ptr + i) error++; + static_assert(std::is_same_v<typename acc_t::element_type, T>); + static_assert( + std::is_same_v<typename acc_t::reference, + desul::AtomicRef<T, desul::MemoryOrderRelaxed, + desul::MemoryScopeDevice>>); + static_assert(std::is_same_v<typename acc_t::data_handle_type, T*>); + static_assert(std::is_same_v<typename acc_t::offset_policy, acc_t>); + static_assert(std::is_same_v<decltype(acc.access(ptr, i)), + typename acc_t::reference>); + static_assert(std::is_same_v<decltype(acc.offset(ptr, i)), T*>); + static_assert(std::is_nothrow_move_constructible_v<acc_t>); + static_assert(std::is_nothrow_move_assignable_v<acc_t>); + static_assert(std::is_nothrow_swappable_v<acc_t>); + static_assert(std::is_trivially_copyable_v<acc_t>); + static_assert(std::is_trivially_default_constructible_v<acc_t>); + static_assert(std::is_trivially_constructible_v<acc_t>); + static_assert(std::is_trivially_move_constructible_v<acc_t>); + static_assert(std::is_trivially_assignable_v<acc_t, acc_t>); + static_assert(std::is_trivially_move_assignable_v<acc_t>); +#ifndef KOKKOS_ENABLE_CXX17 + static_assert(std::copyable<acc_t>); + 
static_assert(std::is_empty_v<acc_t>); +#endif + }, + errors); + ASSERT_EQ(errors, 0); +} + +void test_atomic_accessor_conversion() { + using ExecutionSpace = TEST_EXECSPACE; + using T = float; + using acc_t = Kokkos::Impl::AtomicAccessorRelaxed<T>; + using const_acc_t = Kokkos::Impl::AtomicAccessorRelaxed<const T>; + using int_acc_t = Kokkos::Impl::AtomicAccessorRelaxed<int>; + using defacc_t = Kokkos::default_accessor<T>; + using const_defacc_t = Kokkos::default_accessor<const T>; + using int_defacc_t = Kokkos::default_accessor<int>; + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecutionSpace>(0, 1), KOKKOS_LAMBDA(int) { + static_assert(std::is_constructible_v<const_acc_t, acc_t>); + static_assert(std::is_convertible_v<acc_t, const_acc_t>); + static_assert(!std::is_constructible_v<acc_t, const_acc_t>); + static_assert(!std::is_constructible_v<acc_t, int_acc_t>); + static_assert(std::is_constructible_v<defacc_t, acc_t>); + static_assert(std::is_constructible_v<acc_t, defacc_t>); + static_assert(!std::is_constructible_v<int_defacc_t, acc_t>); + static_assert(!std::is_constructible_v<int_acc_t, defacc_t>); + static_assert(std::is_constructible_v<const_defacc_t, const_acc_t>); + static_assert(std::is_constructible_v<const_acc_t, const_defacc_t>); + static_assert(std::is_constructible_v<const_defacc_t, acc_t>); + static_assert(std::is_constructible_v<const_acc_t, defacc_t>); + static_assert(!std::is_constructible_v<defacc_t, const_acc_t>); + static_assert(!std::is_constructible_v<acc_t, const_defacc_t>); + static_assert(std::is_convertible_v<acc_t, const_acc_t>); + static_assert(std::is_convertible_v<defacc_t, acc_t>); + static_assert(std::is_convertible_v<defacc_t, const_acc_t>); + static_assert(std::is_convertible_v<const_defacc_t, const_acc_t>); + static_assert(!std::is_convertible_v<acc_t, defacc_t>); + static_assert(!std::is_convertible_v<acc_t, const_defacc_t>); + static_assert(!std::is_convertible_v<const_acc_t, const_defacc_t>); + }); +} + 
+TEST(TEST_CATEGORY, mdspan_atomic_accessor) { + using ExecutionSpace = TEST_EXECSPACE; + test_atomic_accessor<int, ExecutionSpace>(); + test_atomic_accessor<double, ExecutionSpace>(); +} diff --git a/packages/kokkos/core/unit_test/TestMDSpanConversion.hpp b/packages/kokkos/core/unit_test/TestMDSpanConversion.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b6b6acd800c541189ac17280a64fcc9aabdcc265 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDSpanConversion.hpp @@ -0,0 +1,558 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <gtest/gtest.h> +#include <type_traits> + +#include <Kokkos_Core.hpp> +#include "experimental/__p0009_bits/layout_stride.hpp" + +namespace { + +template <class T, class ExecutionSpace> +struct TestViewMDSpanConversion { + using value_type = T; + + template <std::size_t Padding> + using layout_left_padded = Kokkos::Experimental::layout_left_padded<Padding>; + + template <std::size_t Padding> + using layout_right_padded = + Kokkos::Experimental::layout_right_padded<Padding>; + + struct TestAccessor { + using offset_policy = TestAccessor; + using element_type = value_type; + using reference = element_type &; + using data_handle_type = element_type *; + + constexpr TestAccessor() noexcept = default; + constexpr reference access(data_handle_type p, std::size_t i) noexcept { + return p[i]; + } + constexpr data_handle_type offset(data_handle_type p, + std::size_t i) noexcept { + return p + i; + } + }; + + template <class 
KokkosLayout, class DataType, class MDSpanLayoutMapping, + class... RefViewProps> + static void test_conversion_from_mdspan( + Kokkos::View<DataType, RefViewProps...> ref, + const MDSpanLayoutMapping &mapping) { + using unmanaged_view_type = + Kokkos::View<DataType, KokkosLayout, ExecutionSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged>>; + using natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename unmanaged_view_type::traits>::mdspan_type; + using mapping_type = MDSpanLayoutMapping; + using mdspan_layout_type = typename MDSpanLayoutMapping::layout_type; + using extents_type = typename mapping_type::extents_type; + using mdspan_type = + Kokkos::mdspan<value_type, extents_type, mdspan_layout_type>; + + static_assert(std::is_constructible_v<natural_mdspan_type, mdspan_type>); + static_assert(std::is_convertible_v<mdspan_type, natural_mdspan_type> == + std::is_convertible_v<mdspan_type, unmanaged_view_type>); + // Manually create an mdspan from ref so we have a valid pointer to play + // with + const auto &exts = mapping.extents(); + auto mds = mdspan_type{ref.data(), mapping}; + + auto test_view = unmanaged_view_type(mds); + + ASSERT_EQ(test_view.data(), ref.data()); + ASSERT_EQ(test_view.data(), mds.data_handle()); + ASSERT_EQ(test_view.layout(), ref.layout()); + for (std::size_t r = 0; r < mdspan_type::rank(); ++r) { + ASSERT_EQ(test_view.extent(r), ref.extent(r)); + ASSERT_EQ(test_view.extent(r), exts.extent(r)); + ASSERT_EQ(test_view.stride(r), ref.stride(r)); + ASSERT_EQ(test_view.stride(r), mapping.stride(r)); + } + } + + template <class MDSpanLayoutMapping, class ViewType> + static void test_conversion_to_mdspan( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v) { + using view_type = ViewType; + using natural_mdspan_type = typename Kokkos::Impl::MDSpanViewTraits< + typename view_type::traits>::mdspan_type; + + static_assert(natural_mdspan_type::rank() == view_type::rank); + static_assert(std::is_same_v<typename 
natural_mdspan_type::value_type, + typename view_type::value_type>); + constexpr bool is_strided_layout = + std::is_same_v<typename MDSpanLayoutMapping::layout_type, + Kokkos::layout_stride>; + if constexpr (!is_strided_layout) { + static_assert(natural_mdspan_type::mapping_type::padding_value == + Kokkos::dynamic_extent); + } + // test conversion operator to natural mdspan + { + natural_mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + + if constexpr (!is_strided_layout && natural_mdspan_type::rank() > 1) { + ASSERT_EQ(cvt.mapping().stride(1), ref_layout_mapping.stride(1)); + } + } + // test to_mdspan() returning natural mdspan + { + auto cvt = v.to_mdspan(); + static_assert(std::is_same_v<natural_mdspan_type, decltype(cvt)>); + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + // test conversion operator to different mdspan type + { + using element_type = const typename natural_mdspan_type::element_type; + using const_acc_type = Kokkos::Impl::SpaceAwareAccessor< + typename ViewType::memory_space, + Kokkos::default_accessor<element_type>>; + using mdspan_type = Kokkos::mdspan< + element_type, + Kokkos::dextents<typename natural_mdspan_type::index_type, + natural_mdspan_type::rank()>, + typename natural_mdspan_type::layout_type, const_acc_type>; + mdspan_type cvt = v; + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + } + + template <class MDSpanLayoutMapping, class ViewType, class AccessorType> + static void test_conversion_to_mdspan_with_accessor( + const MDSpanLayoutMapping &ref_layout_mapping, ViewType v, + const AccessorType &a) { + auto cvt = v.to_mdspan(a); + static_assert(decltype(cvt)::rank() == ViewType::rank); + static_assert(std::is_same_v<typename decltype(cvt)::value_type, + typename ViewType::value_type>); + ASSERT_EQ(cvt.data_handle(), v.data()); + ASSERT_EQ(cvt.mapping(), ref_layout_mapping); + } + 
+ template <typename ViewType> + using natural_mdspan_type_for_view = typename Kokkos::Impl::MDSpanViewTraits< + typename ViewType::traits>::mdspan_type; + + static void run_test() { + // Verify we can only convert to compatible mdspans + static_assert(std::is_convertible_v< + Kokkos::View<value_type *>, + natural_mdspan_type_for_view<Kokkos::View<value_type *>>>); + static_assert( + std::is_convertible_v< + Kokkos::View<value_type *>, + natural_mdspan_type_for_view<Kokkos::View<const value_type *>>>); + + // Do not cast const away + static_assert(!std::is_convertible_v< + Kokkos::View<const value_type *>, + natural_mdspan_type_for_view<Kokkos::View<value_type *>>>); + + // Mismatched dim + static_assert(!std::is_convertible_v< + Kokkos::View<value_type *>, + natural_mdspan_type_for_view<Kokkos::View<value_type **>>>); + + // Mismatched layouts + static_assert( + !std::is_convertible_v<Kokkos::View<value_type **, Kokkos::LayoutLeft>, + natural_mdspan_type_for_view<Kokkos::View< + value_type **, Kokkos::LayoutRight>>>); + static_assert( + !std::is_convertible_v<Kokkos::View<value_type **, Kokkos::LayoutRight>, + natural_mdspan_type_for_view<Kokkos::View< + value_type **, Kokkos::LayoutLeft>>>); + // nvcc doesn't do CTAD properly here, making this way more verbose.. 
+ // LayoutLeft + test_conversion_from_mdspan<Kokkos::LayoutLeft>( + Kokkos::View<value_type *, Kokkos::LayoutLeft, ExecutionSpace>("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents<std::size_t, 1>>{ + Kokkos::dextents<std::size_t, 1>(7)}); + + test_conversion_from_mdspan<Kokkos::LayoutLeft>( + Kokkos::View<value_type[7], Kokkos::LayoutLeft, ExecutionSpace>("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents<std::size_t, 7>>{ + Kokkos::extents<std::size_t, 7>()}); + test_conversion_from_mdspan<Kokkos::LayoutLeft>( + Kokkos::View<value_type[7], Kokkos::LayoutLeft, ExecutionSpace>("ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents<std::size_t, 1>>{ + Kokkos::dextents<std::size_t, 1>(7)}); + test_conversion_from_mdspan<Kokkos::LayoutLeft>( + Kokkos::View<value_type *, Kokkos::LayoutLeft, ExecutionSpace>("ref", + 7), + typename layout_left_padded<7>::template mapping< + Kokkos::extents<std::size_t, 7>>{ + Kokkos::extents<std::size_t, 7>()}); + + test_conversion_from_mdspan<Kokkos::LayoutLeft>( + Kokkos::View<value_type **, Kokkos::LayoutLeft, ExecutionSpace>("ref", + 7, 3), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents<std::size_t, 2>>{ + Kokkos::dextents<std::size_t, 2>(7, 3)}); + test_conversion_from_mdspan<Kokkos::LayoutLeft>( + Kokkos::View<value_type[7][3], Kokkos::LayoutLeft, ExecutionSpace>( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::extents<std::size_t, 7, 3>>{ + Kokkos::extents<std::size_t, 7, 3>()}); + test_conversion_from_mdspan<Kokkos::LayoutLeft>( + Kokkos::View<value_type[7][3], Kokkos::LayoutLeft, ExecutionSpace>( + "ref"), + typename layout_left_padded<7>::template mapping< + Kokkos::dextents<std::size_t, 2>>{ + Kokkos::dextents<std::size_t, 2>(7, 3)}); + test_conversion_from_mdspan<Kokkos::LayoutLeft>( + Kokkos::View<value_type **, Kokkos::LayoutLeft, ExecutionSpace>("ref", + 7, 3), + typename 
layout_left_padded<7>::template mapping< + Kokkos::extents<std::size_t, 7, 3>>{ + Kokkos::extents<std::size_t, 7, 3>()}); + + // LayoutRight + test_conversion_from_mdspan<Kokkos::LayoutRight>( + Kokkos::View<value_type *, Kokkos::LayoutRight, ExecutionSpace>("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents<std::size_t, 1>>{ + Kokkos::dextents<std::size_t, 1>(7)}); + test_conversion_from_mdspan<Kokkos::LayoutRight>( + Kokkos::View<value_type[7], Kokkos::LayoutRight, ExecutionSpace>("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents<std::size_t, 7>>{ + Kokkos::extents<std::size_t, 7>()}); + test_conversion_from_mdspan<Kokkos::LayoutRight>( + Kokkos::View<value_type[7], Kokkos::LayoutRight, ExecutionSpace>("ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents<std::size_t, 1>>{ + Kokkos::dextents<std::size_t, 1>(7)}); + test_conversion_from_mdspan<Kokkos::LayoutRight>( + Kokkos::View<value_type *, Kokkos::LayoutRight, ExecutionSpace>("ref", + 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents<std::size_t, 7>>{ + Kokkos::extents<std::size_t, 7>()}); + + test_conversion_from_mdspan<Kokkos::LayoutRight>( + Kokkos::View<value_type **, Kokkos::LayoutRight, ExecutionSpace>("ref", + 3, 7), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents<std::size_t, 2>>{ + Kokkos::dextents<std::size_t, 2>(3, 7)}); + test_conversion_from_mdspan<Kokkos::LayoutRight>( + Kokkos::View<value_type[3][7], Kokkos::LayoutRight, ExecutionSpace>( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::extents<std::size_t, 3, 7>>{ + Kokkos::extents<std::size_t, 3, 7>()}); + test_conversion_from_mdspan<Kokkos::LayoutRight>( + Kokkos::View<value_type[3][7], Kokkos::LayoutRight, ExecutionSpace>( + "ref"), + typename layout_right_padded<7>::template mapping< + Kokkos::dextents<std::size_t, 2>>{ + Kokkos::dextents<std::size_t, 2>(3, 7)}); + 
test_conversion_from_mdspan<Kokkos::LayoutRight>( + Kokkos::View<value_type **, Kokkos::LayoutRight, ExecutionSpace>("ref", + 3, 7), + typename layout_right_padded<7>::template mapping< + Kokkos::extents<std::size_t, 3, 7>>{ + Kokkos::extents<std::size_t, 3, 7>()}); + + // LayoutStride + { + const size_t strides[] = {2}; + test_conversion_from_mdspan<Kokkos::LayoutStride>( + Kokkos::View<value_type *, Kokkos::LayoutStride, ExecutionSpace>( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping<Kokkos::dextents<std::size_t, 1>>{ + Kokkos::mdspan_non_standard, Kokkos::dextents<std::size_t, 1>{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan<Kokkos::LayoutStride>( + Kokkos::View<value_type[7], Kokkos::LayoutStride, ExecutionSpace>( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 7>>{ + Kokkos::mdspan_non_standard, {}, strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan<Kokkos::LayoutStride>( + Kokkos::View<value_type[7], Kokkos::LayoutStride, ExecutionSpace>( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping<Kokkos::dextents<std::size_t, 1>>{ + Kokkos::mdspan_non_standard, Kokkos::dextents<std::size_t, 1>{7}, + strides}); + } + { + const size_t strides[] = {2}; + test_conversion_from_mdspan<Kokkos::LayoutStride>( + Kokkos::View<value_type *, Kokkos::LayoutStride, ExecutionSpace>( + "ref", Kokkos::LayoutStride{7, 2}), + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 7>>{ + Kokkos::mdspan_non_standard, Kokkos::extents<std::size_t, 7>(), + strides}); + } + + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan<Kokkos::LayoutStride>( + Kokkos::View<value_type **, Kokkos::LayoutStride, ExecutionSpace>( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping<Kokkos::dextents<std::size_t, 2>>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents<std::size_t, 2>(7, 3), 
strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan<Kokkos::LayoutStride>( + Kokkos::View<value_type[7][3], Kokkos::LayoutStride, ExecutionSpace>( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 7, 3>>{ + Kokkos::mdspan_non_standard, Kokkos::extents<std::size_t, 7, 3>(), + strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan<Kokkos::LayoutStride>( + Kokkos::View<value_type[7][3], Kokkos::LayoutStride, ExecutionSpace>( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping<Kokkos::dextents<std::size_t, 2>>{ + Kokkos::mdspan_non_standard, + Kokkos::dextents<std::size_t, 2>(7, 3), strides}); + } + { + const size_t strides[] = {2, 4}; + test_conversion_from_mdspan<Kokkos::LayoutStride>( + Kokkos::View<value_type **, Kokkos::LayoutStride, ExecutionSpace>( + "ref", Kokkos::LayoutStride{7, 2, 3, 4}), + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 7, 3>>{ + Kokkos::mdspan_non_standard, Kokkos::extents<std::size_t, 7, 3>(), + strides}); + } + + // Conversion to mdspan + test_conversion_to_mdspan( + layout_left_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4>>({}, 4), + Kokkos::View<value_type *, Kokkos::LayoutLeft, ExecutionSpace>("v", 4)); + test_conversion_to_mdspan( + layout_left_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4, 7>>({}, 4), + Kokkos::View<value_type **, Kokkos::LayoutLeft, ExecutionSpace>("v", 4, + 7)); + + test_conversion_to_mdspan( + layout_right_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4>>({}, 4), + Kokkos::View<value_type *, Kokkos::LayoutRight, ExecutionSpace>("v", + 4)); + test_conversion_to_mdspan( + layout_right_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4, 7>>({}, 7), + Kokkos::View<value_type **, Kokkos::LayoutRight, ExecutionSpace>("v", 4, + 7)); + + { + const size_t strides[] = 
{5}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 4>>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View<value_type *, Kokkos::LayoutStride, ExecutionSpace>( + "v", Kokkos::LayoutStride{4, 5})); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan( + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 4, 7>>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View<value_type **, Kokkos::LayoutStride, ExecutionSpace>( + "v", Kokkos::LayoutStride{4, 5, 7, 9})); + } + + // Aligned types (for padded layouts) + test_conversion_to_mdspan( + layout_left_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 127, 7>>({}, 128), + Kokkos::View<value_type **, Kokkos::LayoutLeft, ExecutionSpace>( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 127, 7)); + + test_conversion_to_mdspan( + layout_right_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 7, 127>>({}, 128), + Kokkos::View<value_type **, Kokkos::LayoutRight, ExecutionSpace>( + Kokkos::view_alloc("v", Kokkos::AllowPadding), 7, 127)); + + // Conversion with standard default_accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4>>({}, 4), + Kokkos::View<value_type *, Kokkos::LayoutLeft, ExecutionSpace>("v", 4), + Kokkos::default_accessor<value_type>{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4, 7>>({}, 4), + Kokkos::View<value_type **, Kokkos::LayoutLeft, ExecutionSpace>("v", 4, + 7), + Kokkos::default_accessor<value_type>{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4>>({}, 4), + Kokkos::View<value_type *, Kokkos::LayoutRight, ExecutionSpace>("v", 4), + Kokkos::default_accessor<value_type>{}); + 
test_conversion_to_mdspan_with_accessor( + layout_right_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4, 7>>({}, 7), + Kokkos::View<value_type **, Kokkos::LayoutRight, ExecutionSpace>("v", 4, + 7), + Kokkos::default_accessor<value_type>{}); + + { + const size_t strides[] = {5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 4>>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View<value_type *, Kokkos::LayoutStride, ExecutionSpace>( + "v", Kokkos::LayoutStride{4, 5}), + Kokkos::default_accessor<value_type>{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 4, 7>>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View<value_type **, Kokkos::LayoutStride, ExecutionSpace>( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + Kokkos::default_accessor<value_type>{}); + } + + // Conversion with a test accessor + + test_conversion_to_mdspan_with_accessor( + layout_left_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4>>({}, 4), + Kokkos::View<value_type *, Kokkos::LayoutLeft, ExecutionSpace>("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_left_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4, 7>>({}, 4), + Kokkos::View<value_type **, Kokkos::LayoutLeft, ExecutionSpace>("v", 4, + 7), + TestAccessor{}); + + test_conversion_to_mdspan_with_accessor( + layout_right_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4>>({}, 4), + Kokkos::View<value_type *, Kokkos::LayoutRight, ExecutionSpace>("v", 4), + TestAccessor{}); + test_conversion_to_mdspan_with_accessor( + layout_right_padded<Kokkos::dynamic_extent>::mapping< + Kokkos::extents<std::size_t, 4, 7>>({}, 7), + Kokkos::View<value_type **, Kokkos::LayoutRight, ExecutionSpace>("v", 4, + 7), + TestAccessor{}); + + { + const size_t strides[] = 
{5}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 4>>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View<value_type *, Kokkos::LayoutStride, ExecutionSpace>( + "v", Kokkos::LayoutStride{4, 5}), + TestAccessor{}); + } + { + const size_t strides[] = {5, 9}; + test_conversion_to_mdspan_with_accessor( + Kokkos::layout_stride::mapping<Kokkos::extents<std::size_t, 4, 7>>( + Kokkos::mdspan_non_standard, {}, strides), + Kokkos::View<value_type **, Kokkos::LayoutStride, ExecutionSpace>( + "v", Kokkos::LayoutStride{4, 5, 7, 9}), + TestAccessor{}); + } + } +}; + +TEST(TEST_CATEGORY, view_mdspan_conversion) { + TestViewMDSpanConversion<double, TEST_EXECSPACE>::run_test(); + TestViewMDSpanConversion<float, TEST_EXECSPACE>::run_test(); + TestViewMDSpanConversion<int, TEST_EXECSPACE>::run_test(); +} + +TEST(TEST_CATEGORY, view_mdspan_conversion_with_stride) { + { + Kokkos::View<int ***, Kokkos::LayoutLeft> source("S", 20, 40, 70); + auto sub_v = Kokkos::subview(source, Kokkos::pair{5, 15}, Kokkos::ALL(), + Kokkos::pair{2, 38}); + auto sub_mds = sub_v.to_mdspan(); + Kokkos::View<int ***, Kokkos::LayoutLeft> sub_v2(sub_mds); + ASSERT_EQ(static_cast<int>(sub_v.extent(0)), 10); + ASSERT_EQ(static_cast<int>(sub_v.extent(1)), 40); + ASSERT_EQ(static_cast<int>(sub_v.extent(2)), 36); + ASSERT_EQ(static_cast<int>(sub_v.stride(0)), 1); + ASSERT_EQ(static_cast<int>(sub_v.stride(1)), 20); + ASSERT_EQ(static_cast<int>(sub_v.stride(2)), 800); + ASSERT_EQ(static_cast<int>(sub_mds.extent(0)), 10); + ASSERT_EQ(static_cast<int>(sub_mds.extent(1)), 40); + ASSERT_EQ(static_cast<int>(sub_mds.extent(2)), 36); + ASSERT_EQ(static_cast<int>(sub_mds.stride(0)), 1); + ASSERT_EQ(static_cast<int>(sub_mds.stride(1)), 20); + ASSERT_EQ(static_cast<int>(sub_mds.stride(2)), 800); + ASSERT_EQ(static_cast<int>(sub_v2.extent(0)), 10); + ASSERT_EQ(static_cast<int>(sub_v2.extent(1)), 40); + ASSERT_EQ(static_cast<int>(sub_v2.extent(2)), 36); + 
ASSERT_EQ(static_cast<int>(sub_v2.stride(0)), 1); + ASSERT_EQ(static_cast<int>(sub_v2.stride(1)), 20); + ASSERT_EQ(static_cast<int>(sub_v2.stride(2)), 800); + } + { + // layout_right_padded<dynamic_extent> has a custom stride for + // stride(rank-2) LayoutRight has a custom stride for stride(0) That means + // the "padding" only matches up for Rank-2 Views + Kokkos::View<int **, Kokkos::LayoutRight> source("S", 20, 40); + auto sub_v = + Kokkos::subview(source, Kokkos::pair{5, 15}, Kokkos::pair{2, 38}); + auto sub_mds = sub_v.to_mdspan(); + Kokkos::View<int **, Kokkos::LayoutRight> sub_v2(sub_mds); + ASSERT_EQ(static_cast<int>(sub_v.extent(0)), 10); + ASSERT_EQ(static_cast<int>(sub_v.extent(1)), 36); + ASSERT_EQ(static_cast<int>(sub_v.stride(0)), 40); + ASSERT_EQ(static_cast<int>(sub_v.stride(1)), 1); + ASSERT_EQ(static_cast<int>(sub_mds.extent(0)), 10); + ASSERT_EQ(static_cast<int>(sub_mds.extent(1)), 36); + ASSERT_EQ(static_cast<int>(sub_mds.stride(0)), 40); + ASSERT_EQ(static_cast<int>(sub_mds.stride(1)), 1); + ASSERT_EQ(static_cast<int>(sub_v2.extent(0)), 10); + ASSERT_EQ(static_cast<int>(sub_v2.extent(1)), 36); + ASSERT_EQ(static_cast<int>(sub_v2.stride(0)), 40); + ASSERT_EQ(static_cast<int>(sub_v2.stride(1)), 1); + } +} +} // namespace diff --git a/packages/kokkos/core/unit_test/TestMathematicalConstants.hpp b/packages/kokkos/core/unit_test/TestMathematicalConstants.hpp index e446d8132101639cab785e6f078411894c409d84..f52bfeaff7d97f35f6892d3e96a624a698707763 100644 --- a/packages/kokkos/core/unit_test/TestMathematicalConstants.hpp +++ b/packages/kokkos/core/unit_test/TestMathematicalConstants.hpp @@ -63,8 +63,7 @@ struct TestMathematicalConstants { KOKKOS_FUNCTION void use_on_device() const { #if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \ - defined(KOKKOS_ENABLE_OPENACC) || \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 + defined(KOKKOS_ENABLE_OPENACC) take_by_value(Trait::value); #else (void)take_address_of(Trait::value); 
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp index d32ef4ca230d7b32e340aaf1ca66098446509f59..694993d94cc8957ab81cf317226013f655f88988 100644 --- a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp +++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp @@ -30,8 +30,9 @@ #define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS #endif -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable(); #else #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE @@ -273,34 +274,33 @@ struct FloatingPointComparison { #endif // Using absolute here instead of abs, since we actually test abs ... template <class T> - KOKKOS_FUNCTION std::enable_if_t<std::is_signed<T>::value, T> absolute( + KOKKOS_FUNCTION std::enable_if_t<std::is_signed_v<T>, T> absolute( T val) const { return val < T(0) ? 
-val : val; } template <class T> - KOKKOS_FUNCTION std::enable_if_t<!std::is_signed<T>::value, T> absolute( + KOKKOS_FUNCTION std::enable_if_t<!std::is_signed_v<T>, T> absolute( T val) const { return val; } public: template <class FPT> - KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, double ulp) const { + KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, int ulp) const { auto abs_tol = eps(fpv) * ulp; bool ar = absolute(fpv) < abs_tol; if (!ar) { Kokkos::printf("absolute value exceeds tolerance [|%e| > %e]\n", - (double)fpv, abs_tol); + (double)fpv, (double)abs_tol); } return ar; } template <class Lhs, class Rhs> - KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, - double ulp) const { + KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, int ulp) const { if (lhs == 0) { return compare_near_zero(rhs, ulp); } else if (rhs == 0) { @@ -314,7 +314,7 @@ struct FloatingPointComparison { bool ar = abs_diff == 0 || rel_diff < rel_tol; if (!ar) { Kokkos::printf("relative difference exceeds tolerance [%e > %e]\n", - (double)rel_diff, rel_tol); + (double)rel_diff, (double)rel_tol); } return ar; @@ -325,60 +325,56 @@ struct FloatingPointComparison { template <class> struct math_function_name; -#define DEFINE_UNARY_FUNCTION_EVAL(FUNC, ULP_FACTOR) \ - struct MathUnaryFunction_##FUNC { \ - template <typename T> \ - static KOKKOS_FUNCTION auto eval(T x) { \ - static_assert( \ - std::is_same<decltype(Kokkos::FUNC((T)0)), \ - math_unary_function_return_type_t<T>>::value); \ - return Kokkos::FUNC(x); \ - } \ - template <typename T> \ - static auto eval_std(T x) { \ - if constexpr (std::is_same<T, KE::half_t>::value || \ - std::is_same<T, KE::bhalf_t>::value) { \ - return std::FUNC(static_cast<float>(x)); \ - } else { \ - static_assert( \ - std::is_same<decltype(std::FUNC((T)0)), \ - math_unary_function_return_type_t<T>>::value); \ - return std::FUNC(x); \ - } \ - MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ - } \ - static KOKKOS_FUNCTION double 
ulp_factor() { return ULP_FACTOR; } \ - }; \ - using kk_##FUNC = MathUnaryFunction_##FUNC; \ - template <> \ - struct math_function_name<MathUnaryFunction_##FUNC> { \ - static constexpr char name[] = #FUNC; \ - }; \ +#define DEFINE_UNARY_FUNCTION_EVAL(FUNC, ULP_FACTOR) \ + struct MathUnaryFunction_##FUNC { \ + template <typename T> \ + static KOKKOS_FUNCTION auto eval(T x) { \ + static_assert(std::is_same_v<decltype(Kokkos::FUNC((T)0)), \ + math_unary_function_return_type_t<T>>); \ + return Kokkos::FUNC(x); \ + } \ + template <typename T> \ + static auto eval_std(T x) { \ + if constexpr (std::is_same_v<T, KE::half_t> || \ + std::is_same_v<T, KE::bhalf_t>) { \ + return std::FUNC(static_cast<float>(x)); \ + } else { \ + static_assert(std::is_same_v<decltype(std::FUNC((T)0)), \ + math_unary_function_return_type_t<T>>); \ + return std::FUNC(x); \ + } \ + MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ + } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ + }; \ + using kk_##FUNC = MathUnaryFunction_##FUNC; \ + template <> \ + struct math_function_name<MathUnaryFunction_##FUNC> { \ + static constexpr char name[] = #FUNC; \ + }; \ constexpr char math_function_name<MathUnaryFunction_##FUNC>::name[] -#define DEFINE_UNARY_FUNCTION_EVAL_CUSTOM(FUNC, ULP_FACTOR, REF_FUNC) \ - struct MathUnaryFunction_##FUNC { \ - template <typename T> \ - static KOKKOS_FUNCTION auto eval(T x) { \ - static_assert( \ - std::is_same<decltype(Kokkos::FUNC((T)0)), \ - math_unary_function_return_type_t<T>>::value); \ - return Kokkos::FUNC(x); \ - } \ - template <typename T> \ - static auto eval_std(T x) { \ - static_assert( \ - std::is_same<decltype(REF_FUNC), \ - math_unary_function_return_type_t<T>>::value); \ - return REF_FUNC; \ - } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ - }; \ - using kk_##FUNC = MathUnaryFunction_##FUNC; \ - template <> \ - struct math_function_name<MathUnaryFunction_##FUNC> { \ - static constexpr char name[] = #FUNC; \ - }; \ 
+#define DEFINE_UNARY_FUNCTION_EVAL_CUSTOM(FUNC, ULP_FACTOR, REF_FUNC) \ + struct MathUnaryFunction_##FUNC { \ + template <typename T> \ + static KOKKOS_FUNCTION auto eval(T x) { \ + static_assert(std::is_same_v<decltype(Kokkos::FUNC((T)0)), \ + math_unary_function_return_type_t<T>>); \ + return Kokkos::FUNC(x); \ + } \ + template <typename T> \ + static auto eval_std(T x) { \ + static_assert(std::is_same_v<decltype(REF_FUNC), \ + math_unary_function_return_type_t<T>>); \ + return REF_FUNC; \ + } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ + }; \ + using kk_##FUNC = MathUnaryFunction_##FUNC; \ + template <> \ + struct math_function_name<MathUnaryFunction_##FUNC> { \ + static constexpr char name[] = #FUNC; \ + }; \ constexpr char math_function_name<MathUnaryFunction_##FUNC>::name[] #ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_3 @@ -394,10 +390,12 @@ DEFINE_UNARY_FUNCTION_EVAL(log2, 2); DEFINE_UNARY_FUNCTION_EVAL(log1p, 2); #endif -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2); DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2); +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 DEFINE_UNARY_FUNCTION_EVAL(sin, 2); DEFINE_UNARY_FUNCTION_EVAL(cos, 2); DEFINE_UNARY_FUNCTION_EVAL(tan, 2); @@ -449,9 +447,8 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); struct MathBinaryFunction_##FUNC { \ template <typename T, typename U> \ static KOKKOS_FUNCTION auto eval(T x, U y) { \ - static_assert( \ - std::is_same<decltype(Kokkos::FUNC((T)0, (U)0)), \ - math_binary_function_return_type_t<T, U>>::value); \ + static_assert(std::is_same_v<decltype(Kokkos::FUNC((T)0, (U)0)), \ + math_binary_function_return_type_t<T, U>>); \ return Kokkos::FUNC(x, y); \ } \ template <typename T, typename U> \ @@ -468,13 +465,13 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); return std::FUNC(x, static_cast<float>(y)); \ else { \ static_assert( \ - std::is_same<decltype(std::FUNC((T)0, (U)0)), \ - 
math_binary_function_return_type_t<T, U>>::value); \ + std::is_same_v<decltype(std::FUNC((T)0, (U)0)), \ + math_binary_function_return_type_t<T, U>>); \ return std::FUNC(x, y); \ } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathBinaryFunction_##FUNC; \ template <> \ @@ -483,43 +480,41 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); }; \ constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[] -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_BINARY_FUNCTION_EVAL(pow, 2); DEFINE_BINARY_FUNCTION_EVAL(hypot, 2); -#endif -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_BINARY_FUNCTION_EVAL(nextafter, 1); DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); #endif #undef DEFINE_BINARY_FUNCTION_EVAL -#define DEFINE_TERNARY_FUNCTION_EVAL(FUNC, ULP_FACTOR) \ - struct MathTernaryFunction_##FUNC { \ - template <typename T, typename U, typename V> \ - static KOKKOS_FUNCTION auto eval(T x, U y, V z) { \ - static_assert( \ - std::is_same<decltype(Kokkos::FUNC((T)0, (U)0, (V)0)), \ - math_ternary_function_return_type_t<T, U, V>>::value); \ - return Kokkos::FUNC(x, y, z); \ - } \ - template <typename T, typename U, typename V> \ - static auto eval_std(T x, U y, V z) { \ - static_assert( \ - std::is_same<decltype(std::FUNC((T)0, (U)0, (V)0)), \ - math_ternary_function_return_type_t<T, U, V>>::value); \ - return std::FUNC(x, y, z); \ - } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ - }; \ - using kk3_##FUNC = MathTernaryFunction_##FUNC; \ - template <> \ - struct math_function_name<MathTernaryFunction_##FUNC> { \ - static constexpr char name[] = #FUNC; \ - }; \ +#define DEFINE_TERNARY_FUNCTION_EVAL(FUNC, ULP_FACTOR) \ + struct MathTernaryFunction_##FUNC { \ + template <typename T, typename U, typename V> \ + static KOKKOS_FUNCTION auto eval(T x, U 
y, V z) { \ + static_assert( \ + std::is_same_v<decltype(Kokkos::FUNC((T)0, (U)0, (V)0)), \ + math_ternary_function_return_type_t<T, U, V>>); \ + return Kokkos::FUNC(x, y, z); \ + } \ + template <typename T, typename U, typename V> \ + static auto eval_std(T x, U y, V z) { \ + static_assert( \ + std::is_same_v<decltype(std::FUNC((T)0, (U)0, (V)0)), \ + math_ternary_function_return_type_t<T, U, V>>); \ + return std::FUNC(x, y, z); \ + } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ + }; \ + using kk3_##FUNC = MathTernaryFunction_##FUNC; \ + template <> \ + struct math_function_name<MathTernaryFunction_##FUNC> { \ + static constexpr char name[] = #FUNC; \ + }; \ constexpr char math_function_name<MathTernaryFunction_##FUNC>::name[] -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_TERNARY_FUNCTION_EVAL(hypot, 2); DEFINE_TERNARY_FUNCTION_EVAL(fma, 2); #endif @@ -787,7 +782,9 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) { // TODO atan2 } +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 TEST(TEST_CATEGORY, mathematical_functions_power_functions) { TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11}); TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l}); @@ -1304,12 +1301,12 @@ struct TestAbsoluteValueFunction { if (abs(static_cast<KE::half_t>(4.f)) != static_cast<KE::half_t>(4.f) || abs(static_cast<KE::half_t>(-4.f)) != static_cast<KE::half_t>(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::half_t)\n"); + Kokkos::printf("failed abs(KE::half_t)\n"); } if (abs(static_cast<KE::bhalf_t>(4.f)) != static_cast<KE::bhalf_t>(4.f) || abs(static_cast<KE::bhalf_t>(-4.f)) != static_cast<KE::bhalf_t>(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::bhalf_t)\n"); + Kokkos::printf("failed abs(KE::bhalf_t)\n"); } if (abs(5.) != 5. || abs(-5.) != 5.) 
{ ++e; @@ -1329,19 +1326,17 @@ struct TestAbsoluteValueFunction { Kokkos::printf("failed abs(floating_point) special values\n"); } - static_assert(std::is_same<decltype(abs(1)), int>::value, ""); - static_assert(std::is_same<decltype(abs(2l)), long>::value, ""); - static_assert(std::is_same<decltype(abs(3ll)), long long>::value, ""); + static_assert(std::is_same_v<decltype(abs(1)), int>); + static_assert(std::is_same_v<decltype(abs(2l)), long>); + static_assert(std::is_same_v<decltype(abs(3ll)), long long>); static_assert(std::is_same<decltype(abs(static_cast<KE::half_t>(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same<decltype(abs(static_cast<KE::bhalf_t>(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same<decltype(abs(4.f)), float>::value, ""); - static_assert(std::is_same<decltype(abs(5.)), double>::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same_v<decltype(abs(4.f)), float>); + static_assert(std::is_same_v<decltype(abs(5.)), double>); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same<decltype(abs(6.l)), long double>::value, ""); + static_assert(std::is_same_v<decltype(abs(6.l)), long double>); #endif } }; @@ -1362,26 +1357,26 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::fabs; if (fabs(4.f) != 4.f || fabs(-4.f) != 4.f) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(float)\n"); + Kokkos::printf("failed fabs(float)\n"); } if (fabs(static_cast<KE::half_t>(4.f)) != static_cast<KE::half_t>(4.f) || fabs(static_cast<KE::half_t>(-4.f)) != static_cast<KE::half_t>(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::half_t)\n"); + Kokkos::printf("failed fabs(KE::half_t)\n"); } if (fabs(static_cast<KE::bhalf_t>(4.f)) != static_cast<KE::bhalf_t>(4.f) || fabs(static_cast<KE::bhalf_t>(-4.f)) != static_cast<KE::bhalf_t>(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::bhalf_t)\n"); + Kokkos::printf("failed 
fabs(KE::bhalf_t)\n"); } if (fabs(5.) != 5. || fabs(-5.) != 5.) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(double)\n"); + Kokkos::printf("failed fabs(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (fabs(6.l) != 6.l || fabs(-6.l) != 6.l) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(long double)\n"); + Kokkos::printf("failed fabs(long double)\n"); } #endif // special values @@ -1389,18 +1384,17 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::isnan; if (fabs(-0.) != 0. || !isinf(fabs(-INFINITY)) || !isnan(fabs(-NAN))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fabs(floating_point) special values\n"); + Kokkos::printf("failed fabs(floating_point) special values\n"); } static_assert(std::is_same<decltype(fabs(static_cast<KE::half_t>(4.f))), KE::half_t>::value); static_assert(std::is_same<decltype(fabs(static_cast<KE::bhalf_t>(4.f))), KE::bhalf_t>::value); - static_assert(std::is_same<decltype(fabs(4.f)), float>::value); - static_assert(std::is_same<decltype(fabs(5.)), double>::value); + static_assert(std::is_same_v<decltype(fabs(4.f)), float>); + static_assert(std::is_same_v<decltype(fabs(5.)), double>); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same<decltype(fabs(6.l)), long double>::value); + static_assert(std::is_same_v<decltype(fabs(6.l)), long double>); #endif } }; @@ -1422,7 +1416,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(fmod(6.2f, 4.f), 2.2f, 1) && !compare(fmod(-6.2f, 4.f), -2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(float)\n"); + Kokkos::printf("failed fmod(float)\n"); } if (!compare( fmod(static_cast<KE::half_t>(6.2f), static_cast<KE::half_t>(4.f)), @@ -1431,7 +1425,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { fmod(static_cast<KE::half_t>(-6.2f), static_cast<KE::half_t>(4.f)), -static_cast<KE::half_t>(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed 
fmod(KE::half_t)\n"); + Kokkos::printf("failed fmod(KE::half_t)\n"); } if (!compare( fmod(static_cast<KE::bhalf_t>(6.2f), static_cast<KE::bhalf_t>(4.f)), @@ -1440,17 +1434,17 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { static_cast<KE::bhalf_t>(4.f)), -static_cast<KE::bhalf_t>(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::bhalf_t)\n"); + Kokkos::printf("failed fmod(KE::bhalf_t)\n"); } if (!compare(fmod(6.2, 4.), 2.2, 1) && !compare(fmod(-6.2, 4.), -2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(double)\n"); + Kokkos::printf("failed fmod(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(fmod(6.2l, 4.l), 2.2l, 1) && !compare(fmod(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(long double)\n"); + Kokkos::printf("failed fmod(long double)\n"); } #endif // special values @@ -1459,23 +1453,19 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(fmod(-KE::infinity<float>::value, 1.f)) && !isnan(fmod(-KE::quiet_NaN<float>::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fmod(floating_point) special values\n"); + Kokkos::printf("failed fmod(floating_point) special values\n"); } static_assert(std::is_same<decltype(fmod(static_cast<KE::half_t>(4.f), static_cast<KE::half_t>(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same<decltype(fmod(static_cast<KE::bhalf_t>(4.f), static_cast<KE::bhalf_t>(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same<decltype(fmod(4.f, 4.f)), float>::value, ""); - static_assert(std::is_same<decltype(fmod(5., 5.)), double>::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same_v<decltype(fmod(4.f, 4.f)), float>); + static_assert(std::is_same_v<decltype(fmod(5., 5.)), double>); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same<decltype(fmod(6.l, 6.l)), long double>::value, - ""); + 
static_assert(std::is_same_v<decltype(fmod(6.l, 6.l)), long double>); #endif } }; @@ -1499,7 +1489,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(remainder(6.2f, 4.f), 2.2f, 2) && !compare(remainder(-6.2f, 4.f), 2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(float)\n"); + Kokkos::printf("failed remainder(float)\n"); } if (!compare(remainder(static_cast<KE::half_t>(6.2f), static_cast<KE::half_t>(4.f)), @@ -1508,7 +1498,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast<KE::half_t>(4.f)), -static_cast<KE::half_t>(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::half_t)\n"); + Kokkos::printf("failed remainder(KE::half_t)\n"); } if (!compare(remainder(static_cast<KE::bhalf_t>(6.2f), static_cast<KE::bhalf_t>(4.f)), @@ -1517,18 +1507,18 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast<KE::bhalf_t>(4.f)), -static_cast<KE::bhalf_t>(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::bhalf_t)\n"); + Kokkos::printf("failed remainder(KE::bhalf_t)\n"); } if (!compare(remainder(6.2, 4.), 2.2, 2) && !compare(remainder(-6.2, 4.), 2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(double)\n"); + Kokkos::printf("failed remainder(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(remainder(6.2l, 4.l), 2.2l, 1) && !compare(remainder(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(long double)\n"); + Kokkos::printf("failed remainder(long double)\n"); } #endif // special values @@ -1537,26 +1527,23 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(remainder(-KE::infinity<float>::value, 1.f)) && !isnan(remainder(-KE::quiet_NaN<float>::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( + Kokkos::printf( "failed remainder(floating_point) special values\n"); } static_assert( 
std::is_same<decltype(remainder(static_cast<KE::half_t>(4.f), static_cast<KE::half_t>(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert( std::is_same<decltype(remainder(static_cast<KE::bhalf_t>(4.f), static_cast<KE::bhalf_t>(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same<decltype(remainder(4.f, 4.f)), float>::value, - ""); - static_assert(std::is_same<decltype(remainder(5., 5.)), double>::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same_v<decltype(remainder(4.f, 4.f)), float>); + static_assert(std::is_same_v<decltype(remainder(5., 5.)), double>); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS static_assert( - std::is_same<decltype(remainder(6.l, 6.l)), long double>::value, ""); + std::is_same_v<decltype(remainder(6.l, 6.l)), long double>); #endif } }; @@ -1568,6 +1555,7 @@ TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) { // TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 template <class Space> struct TestIsFinite { TestIsFinite() { run(); } @@ -1591,33 +1579,25 @@ struct TestIsFinite { ++e; Kokkos::printf("failed isfinite(float)\n"); } - if (!isfinite(static_cast<KE::half_t>(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isfinite(quiet_NaN<KE::half_t>::value) || +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) + if (!isfinite(static_cast<KE::half_t>(2.f)) || + isfinite(quiet_NaN<KE::half_t>::value) || isfinite(signaling_NaN<KE::half_t>::value) || - isfinite(infinity<KE::half_t>::value) -#endif - ) { + isfinite(infinity<KE::half_t>::value)) { ++e; Kokkos::printf("failed isfinite(KE::half_t)\n"); } - if (!isfinite(static_cast<KE::bhalf_t>(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isfinite(quiet_NaN<KE::bhalf_t>::value) || + if (!isfinite(static_cast<KE::bhalf_t>(2.f)) || + isfinite(quiet_NaN<KE::bhalf_t>::value) || 
isfinite(signaling_NaN<KE::bhalf_t>::value) || - isfinite(infinity<KE::bhalf_t>::value) -#endif - ) { + isfinite(infinity<KE::bhalf_t>::value)) { ++e; Kokkos::printf("failed isfinite(KE::bhalf_t)\n"); } - if (!isfinite(3.) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isfinite(quiet_NaN<double>::value) || - isfinite(signaling_NaN<double>::value) || - isfinite(infinity<double>::value) #endif - ) { + if (!isfinite(3.) || isfinite(quiet_NaN<double>::value) || + isfinite(signaling_NaN<double>::value) || + isfinite(infinity<double>::value)) { ++e; Kokkos::printf("failed isfinite(double)\n"); } @@ -1635,11 +1615,11 @@ struct TestIsFinite { Kokkos::printf("failed isfinite(floating_point) special values\n"); } - static_assert(std::is_same<decltype(isfinite(1)), bool>::value); - static_assert(std::is_same<decltype(isfinite(2.f)), bool>::value); - static_assert(std::is_same<decltype(isfinite(3.)), bool>::value); + static_assert(std::is_same_v<decltype(isfinite(1)), bool>); + static_assert(std::is_same_v<decltype(isfinite(2.f)), bool>); + static_assert(std::is_same_v<decltype(isfinite(3.)), bool>); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same<decltype(isfinite(4.l)), bool>::value); + static_assert(std::is_same_v<decltype(isfinite(4.l)), bool>); #endif } }; @@ -1670,32 +1650,25 @@ struct TestIsInf { ++e; Kokkos::printf("failed isinf(float)\n"); } - if (isinf(static_cast<KE::half_t>(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isinf(quiet_NaN<KE::half_t>::value) || +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) + if (isinf(static_cast<KE::half_t>(2.f)) || + isinf(quiet_NaN<KE::half_t>::value) || isinf(signaling_NaN<KE::half_t>::value) || - !isinf(infinity<KE::half_t>::value) -#endif - ) { + !isinf(infinity<KE::half_t>::value)) { ++e; Kokkos::printf("failed isinf(KE::half_t)\n"); } - if (isinf(static_cast<KE::bhalf_t>(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || 
isinf(quiet_NaN<KE::bhalf_t>::value) || + if (isinf(static_cast<KE::bhalf_t>(2.f)) || + isinf(quiet_NaN<KE::bhalf_t>::value) || isinf(signaling_NaN<KE::bhalf_t>::value) || - !isinf(infinity<KE::bhalf_t>::value) -#endif - ) { + !isinf(infinity<KE::bhalf_t>::value)) { ++e; Kokkos::printf("failed isinf(KE::bhalf_t)\n"); } - if (isinf(3.) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || isinf(quiet_NaN<double>::value) || - isinf(signaling_NaN<double>::value) || !isinf(infinity<double>::value) #endif - ) { + if (isinf(3.) || isinf(quiet_NaN<double>::value) || + isinf(signaling_NaN<double>::value) || + !isinf(infinity<double>::value)) { ++e; Kokkos::printf("failed isinf(double)\n"); } @@ -1713,11 +1686,11 @@ struct TestIsInf { Kokkos::printf("failed isinf(floating_point) special values\n"); } - static_assert(std::is_same<decltype(isinf(1)), bool>::value); - static_assert(std::is_same<decltype(isinf(2.f)), bool>::value); - static_assert(std::is_same<decltype(isinf(3.)), bool>::value); + static_assert(std::is_same_v<decltype(isinf(1)), bool>); + static_assert(std::is_same_v<decltype(isinf(2.f)), bool>); + static_assert(std::is_same_v<decltype(isinf(3.)), bool>); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same<decltype(isinf(4.l)), bool>::value); + static_assert(std::is_same_v<decltype(isinf(4.l)), bool>); #endif } }; @@ -1748,35 +1721,28 @@ struct TestIsNaN { ++e; Kokkos::printf("failed isnan(float)\n"); } - if (isnan(static_cast<KE::half_t>(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || !isnan(quiet_NaN<KE::half_t>::value) || +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) + if (isnan(static_cast<KE::half_t>(2.f)) || + !isnan(quiet_NaN<KE::half_t>::value) || !isnan(signaling_NaN<KE::half_t>::value) || - isnan(infinity<KE::half_t>::value) -#endif - ) { + isnan(infinity<KE::half_t>::value)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); + Kokkos::printf("failed 
isnan(KE::half_t)\n"); } - if (isnan(static_cast<KE::bhalf_t>(2.f)) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || !isnan(quiet_NaN<KE::bhalf_t>::value) || + if (isnan(static_cast<KE::bhalf_t>(2.f)) || + !isnan(quiet_NaN<KE::bhalf_t>::value) || !isnan(signaling_NaN<KE::bhalf_t>::value) || - isnan(infinity<KE::bhalf_t>::value) -#endif - ) { + isnan(infinity<KE::bhalf_t>::value)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); + Kokkos::printf("failed isnan(KE::bhalf_t)\n"); } - if (isnan(3.) -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 - || !isnan(quiet_NaN<double>::value) || - !isnan(signaling_NaN<double>::value) || isnan(infinity<double>::value) -#endif - ) { + if (isnan(3.) || !isnan(quiet_NaN<double>::value) || + !isnan(signaling_NaN<double>::value) || + isnan(infinity<double>::value)) { ++e; Kokkos::printf("failed isnan(double)\n"); } +#endif #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (isnan(4.l) || !isnan(quiet_NaN<long double>::value) || !isnan(signaling_NaN<long double>::value) || @@ -1791,11 +1757,11 @@ struct TestIsNaN { Kokkos::printf("failed isnan(floating_point) special values\n"); } - static_assert(std::is_same<decltype(isnan(1)), bool>::value, ""); - static_assert(std::is_same<decltype(isnan(2.f)), bool>::value, ""); - static_assert(std::is_same<decltype(isnan(3.)), bool>::value, ""); + static_assert(std::is_same_v<decltype(isnan(1)), bool>); + static_assert(std::is_same_v<decltype(isnan(2.f)), bool>); + static_assert(std::is_same_v<decltype(isnan(3.)), bool>); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same<decltype(isnan(4.l)), bool>::value, ""); + static_assert(std::is_same_v<decltype(isnan(4.l)), bool>); #endif } }; @@ -1803,6 +1769,7 @@ struct TestIsNaN { TEST(TEST_CATEGORY, mathematical_functions_isnan) { TestIsNaN<TEST_EXECSPACE>(); } +#endif // TODO: TestSignBit, see https://github.com/kokkos/kokkos/issues/6279 #endif diff --git 
a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp index 06c84c751370d7c4b6a703047b2e1b834fa1934a..23974bb4137fc747d22f0ff6bb3256d1df1e4c6d 100644 --- a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -1213,13 +1213,12 @@ struct TestComplexBesselI0K0Function { } EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0)); - int upper_limit = N; + int upper_limit_0 = N; // FIXME_SYCL Failing for Intel GPUs, 19 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) - upper_limit = 19; + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) upper_limit_0 = 19; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_0; i++) { EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)), Kokkos::abs(h_ref_cbk0(i)) * 1e-13) << "at index " << i; @@ -1462,13 +1461,12 @@ struct TestComplexBesselI1K1Function { } EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0)); - int upper_limit = N; + int upper_limit_1 = N; // FIXME_SYCL Failing for Intel GPUs, 8 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) - upper_limit = 8; + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) upper_limit_1 = 8; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_1; i++) { EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)), Kokkos::abs(h_ref_cbk1(i)) * 1e-13) << "at index " << i; @@ -1712,26 +1710,31 @@ struct TestComplexBesselH1Function { h_ref_ch11(24) = Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458039000e-02); - // FIXME_HIP Disable the test when using ROCm 5.5 and 5.6 due to a known - // compiler bug -#if !defined(KOKKOS_ENABLE_HIP) || (HIP_VERSION_MAJOR != 5) || \ - 
((HIP_VERSION_MAJOR == 5) && \ - !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) + // FIXME_HIP Disable the test when using ROCm 5.5, 5.6, and 6.2 due to a + // known compiler bug +#if !(defined(KOKKOS_ENABLE_HIP) && \ + ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR == 5) || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR == 6) || \ + (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2))) EXPECT_EQ(h_ref_ch10(0), h_ch10(0)); - for (int i = 1; i < N; i++) { + int upper_limit_10 = N; +// FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) upper_limit_10 = 17; +#endif + for (int i = 1; i < upper_limit_10; i++) { EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)), Kokkos::abs(h_ref_ch10(i)) * 1e-13) << "at index " << i; } EXPECT_EQ(h_ref_ch11(0), h_ch11(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case + int upper_limit_11 = N; + // FIXME_SYCL Failing for Intel GPUs, 2 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) - upper_limit = 16; + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) upper_limit_11 = 2; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_11; i++) { EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)), Kokkos::abs(h_ref_ch11(i)) * 1e-13) << "at index " << i; @@ -1906,25 +1909,31 @@ struct TestComplexBesselH2Function { h_ref_ch21(24) = Kokkos::complex<double>(1.629136145471347e-01, +1.530182458039000e-02); - // FIXME_HIP Disable the test when using ROCm 5.5 and 5.6 due to a known - // compiler bug -#if !defined(KOKKOS_ENABLE_HIP) || (HIP_VERSION_MAJOR != 5) || \ - ((HIP_VERSION_MAJOR == 5) && \ - !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) + // FIXME_HIP Disable the test when using ROCm 5.5, 5.6, and 6.2 due 
to a + // known compiler bug +#if !(defined(KOKKOS_ENABLE_HIP) && \ + ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR == 5) || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR == 6) || \ + (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2))) EXPECT_EQ(h_ref_ch20(0), h_ch20(0)); - for (int i = 1; i < N; i++) { + int upper_limit_20 = N; +// FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) upper_limit_20 = 16; +#endif + for (int i = 1; i < upper_limit_20; i++) { EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)), - Kokkos::abs(h_ref_ch20(i)) * 1e-13); + Kokkos::abs(h_ref_ch20(i)) * 1e-13) + << "at index " << i; } EXPECT_EQ(h_ref_ch21(0), h_ch21(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case + int upper_limit_21 = N; + // FIXME_SYCL Failing for Intel GPUs, 1 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) - upper_limit = 17; + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) upper_limit_21 = 1; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_21; i++) { EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)), Kokkos::abs(h_ref_ch21(i)) * 1e-13) << "at index " << i; @@ -1954,31 +1963,61 @@ TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) { #endif TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ0Y0Function<TEST_EXECSPACE> test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) &&
defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ1Y1Function<TEST_EXECSPACE> test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI0K0Function<TEST_EXECSPACE> test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI1K1Function<TEST_EXECSPACE> test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH1Function<TEST_EXECSPACE> test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH2Function<TEST_EXECSPACE> test; test.testit(); } diff --git a/packages/kokkos/core/unit_test/TestMemoryPool.hpp b/packages/kokkos/core/unit_test/TestMemoryPool.hpp index 
f0bf1182924c9e2ac57fea092d3f1c5b7d4eac18..c992458307a95b71c62d5ea7e0e0b4a66b10dfcd 100644 --- a/packages/kokkos/core/unit_test/TestMemoryPool.hpp +++ b/packages/kokkos/core/unit_test/TestMemoryPool.hpp @@ -455,9 +455,8 @@ struct TestMemoryPoolHuge { template <class DeviceType> struct TestMemoryPoolHuge< - DeviceType, - std::enable_if_t<std::is_same<Kokkos::HostSpace, - typename DeviceType::memory_space>::value>> { + DeviceType, std::enable_if_t<std::is_same_v< + Kokkos::HostSpace, typename DeviceType::memory_space>>> { using ptrs_type = Kokkos::View<uintptr_t*, DeviceType>; using pool_type = Kokkos::MemoryPool<DeviceType>; using memory_space = typename DeviceType::memory_space; diff --git a/packages/kokkos/core/unit_test/TestMultiGPU.hpp b/packages/kokkos/core/unit_test/TestMultiGPU.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aad2fa45f4921c9481b3e1a2fd10db54e8ac9eba --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMultiGPU.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Test_InterOp_Streams.hpp> + +namespace { + +void test_policies(TEST_EXECSPACE exec0, Kokkos::View<int *, TEST_EXECSPACE> v0, + TEST_EXECSPACE exec, Kokkos::View<int *, TEST_EXECSPACE> v) { + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + exec.fence(); + exec0.fence(); + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::Range_0", + Kokkos::RangePolicy<TEST_EXECSPACE>(exec0, 0, 100), + Test::FunctorRange<MemorySpace>(v0)); + Kokkos::parallel_for("Test::Range", + Kokkos::RangePolicy<TEST_EXECSPACE>(exec, 0, 100), + Test::FunctorRange<MemorySpace>(v)); + exec.fence(); + exec0.fence(); + Kokkos::parallel_reduce( + "Test::RangeReduce_0", + Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec0, + 0, 100), + Test::FunctorRangeReduce<MemorySpace>(v0), sum0); + Kokkos::parallel_reduce( + "Test::RangeReduce", + Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec, 0, + 100), + Test::FunctorRangeReduce<MemorySpace>(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::MDRange_0", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange<MemorySpace>(v0)); + Kokkos::parallel_for("Test::MDRange", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange<MemorySpace>(v)); + Kokkos::parallel_reduce("Test::MDRangeReduce_0", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce<MemorySpace>(v0), sum0); + Kokkos::parallel_reduce("Test::MDRangeReduce", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce<MemorySpace>(v), sum); + 
ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::Team_0", + Kokkos::TeamPolicy<TEST_EXECSPACE>(exec0, 10, 10), + Test::FunctorTeam<MemorySpace, TEST_EXECSPACE>(v0)); + Kokkos::parallel_for("Test::Team", + Kokkos::TeamPolicy<TEST_EXECSPACE>(exec, 10, 10), + Test::FunctorTeam<MemorySpace, TEST_EXECSPACE>(v)); + Kokkos::parallel_reduce( + "Test::Team_0", + Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec0, + 10, 10), + Test::FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v0), sum0); + Kokkos::parallel_reduce( + "Test::Team", + Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>(exec, 10, + 10), + Test::FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type &team, + int &error_accum) const { + Kokkos::View<int *, TEST_EXECSPACE::scratch_memory_space> scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void 
test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View<int *, TEST_EXECSPACE::scratch_memory_space>; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + Kokkos::TeamPolicy<TEST_EXECSPACE>(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy<TEST_EXECSPACE>(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy<TEST_EXECSPACE>(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy<TEST_EXECSPACE>(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} +} // namespace diff --git a/packages/kokkos/core/unit_test/TestNestedReducerCTAD.cpp b/packages/kokkos/core/unit_test/TestNestedReducerCTAD.cpp new file mode 100644 index 0000000000000000000000000000000000000000..927aa21c16e861d2fcd0dd92ad187df71e585eb3 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestNestedReducerCTAD.cpp @@ -0,0 +1,246 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +namespace { + +struct TestNestedReducerCTAD { + using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space; + using ScalarType = int; + using IndexType = int; + using TeamPolicy = Kokkos::TeamPolicy<Kokkos::DefaultExecutionSpace>; + using TeamHandle = TeamPolicy::member_type; + + struct FakeComparator { + template <class T> + KOKKOS_FUNCTION bool operator()(T const&, T const&) const { + return true; + } + }; + + template <class ValueType> + struct FakeFunctor { + KOKKOS_FUNCTION void operator()(int, ValueType&) const {} + }; + + template <class ReducerTypeExpected, class ReducerTypeToCheck> + KOKKOS_FUNCTION static void check_types( + [[maybe_unused]] ReducerTypeToCheck const& reducer) { + static_assert(std::is_same_v<ReducerTypeExpected, ReducerTypeToCheck>); + } + + KOKKOS_FUNCTION void operator()( + [[maybe_unused]] TeamHandle const& team_handle) const { + { + using ReducerTypeExpected = Kokkos::Sum<ScalarType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::Sum reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::Prod<ScalarType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::Prod reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::Min<ScalarType, 
MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::Min reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::Max<ScalarType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::Max reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::LAnd<ScalarType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::LAnd reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::LOr<ScalarType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::LOr reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::BAnd<ScalarType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::BAnd reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::BOr<ScalarType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::BOr reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinLoc<ScalarType, IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::MinLoc reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MaxLoc<ScalarType, IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::MaxLoc reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using 
ReducerTypeExpected = Kokkos::MinMax<ScalarType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::MinMax reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinMaxLoc<ScalarType, IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::MinMaxLoc reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MaxFirstLoc<ScalarType, IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::MaxFirstLoc reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MaxFirstLocCustomComparator<ScalarType, IndexType, + FakeComparator, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + FakeComparator comparator; + Kokkos::MaxFirstLocCustomComparator reducer(view, comparator); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinFirstLoc<ScalarType, IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::MinFirstLoc reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinFirstLocCustomComparator<ScalarType, IndexType, + FakeComparator, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + FakeComparator comparator; + Kokkos::MinFirstLocCustomComparator reducer(view, comparator); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::MinMaxFirstLastLoc<ScalarType, IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, 
MemorySpace> view; + Kokkos::MinMaxFirstLastLoc reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::MinMaxFirstLastLocCustomComparator< + ScalarType, IndexType, FakeComparator, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + FakeComparator comparator; + Kokkos::MinMaxFirstLastLocCustomComparator reducer(view, comparator); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::FirstLoc<IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::FirstLoc reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = Kokkos::LastLoc<IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::LastLoc reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::StdIsPartitioned<IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::StdIsPartitioned reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + + { + using ReducerTypeExpected = + Kokkos::StdPartitionPoint<IndexType, MemorySpace>; + using ValueType = ReducerTypeExpected::value_type; + Kokkos::View<ValueType, MemorySpace> view; + Kokkos::StdPartitionPoint reducer(view); + check_types<ReducerTypeExpected>(reducer); + } + } + + TestNestedReducerCTAD() { + Kokkos::parallel_for(TeamPolicy(0, Kokkos::AUTO), *this); + } +}; + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp index eaf7a4125cc3ca23f4b26b60a6a602a7d0cf7a89..116ac58c39ff2f2b093d14ddee4ffb8f85c12c0a 100644 --- a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp +++ 
b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp @@ -214,7 +214,7 @@ struct point_t { uint8_t x, y, z; KOKKOS_FUNCTION - point_t() : x(1), y(1), z(1){}; + point_t() : x(0), y(0), z(0){}; KOKKOS_FUNCTION point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){}; diff --git a/packages/kokkos/core/unit_test/TestNumericTraits.hpp b/packages/kokkos/core/unit_test/TestNumericTraits.hpp index 2b5531f29a6a8d7c3fc10db59967fb9d1479832c..2cdc6515f633db66537be630e457771c498b4acc 100644 --- a/packages/kokkos/core/unit_test/TestNumericTraits.hpp +++ b/packages/kokkos/core/unit_test/TestNumericTraits.hpp @@ -14,12 +14,28 @@ // //@HEADER +#include <Kokkos_Macros.hpp> + +// Suppress "'long double' is treated as 'double' in device code" +// The suppression needs to happen before Kokkos_NumericTraits.hpp is included +// to be effective +#ifdef KOKKOS_COMPILER_NVCC +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 20208 +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic push +#pragma diag_suppress 3245 +#endif +#endif +#endif + #include <gtest/gtest.h> #include <Kokkos_Core.hpp> #include <type_traits> #include <limits> -#include "Kokkos_NumericTraits.hpp" struct extrema { #define DEFINE_EXTREMA(T, m, M) \ @@ -110,8 +126,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const { using Kokkos::Experimental::epsilon; - auto const eps = epsilon<T>::value; - auto const one = T(1); + T const eps = epsilon<T>::value; + T const one = 1; // Avoid higher precision intermediate representation compare() = one + eps; e += (int)!(compare() != one); @@ -145,33 +161,25 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(MaxExponent10, int, int&) const { use_on_device(); } // clang-format on KOKKOS_FUNCTION void operator()(QuietNaN, int, int& e) const { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 nan using Kokkos::Experimental::quiet_NaN; constexpr auto nan = quiet_NaN<T>::value; auto const zero 
= T(0); e += (int)!(nan != nan); e += (int)!(nan != zero); -#else - (void)e; -#endif use_on_device(); } KOKKOS_FUNCTION void operator()(SignalingNaN, int, int& e) const { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 nan using Kokkos::Experimental::signaling_NaN; constexpr auto nan = signaling_NaN<T>::value; auto const zero = T(0); e += (int)!(nan != nan); e += (int)!(nan != zero); -#else - (void)e; -#endif use_on_device(); } KOKKOS_FUNCTION void use_on_device() const { -#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_NVHPC) || \ - defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC) +#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \ + defined(KOKKOS_ENABLE_OPENACC) take_by_value(trait<T>::value); #else (void)take_address_of(trait<T>::value); @@ -186,7 +194,7 @@ struct TestNumericTraits< #if defined(KOKKOS_ENABLE_CUDA) Kokkos::Cuda, #elif defined(KOKKOS_ENABLE_SYCL) - Kokkos::Experimental::SYCL, + Kokkos::SYCL, #else Kokkos::Experimental::OpenMPTarget, #endif @@ -204,58 +212,46 @@ struct TestNumericTraits< #endif TEST(TEST_CATEGORY, numeric_traits_infinity) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, Infinity>(); TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Infinity>(); -#endif TestNumericTraits<TEST_EXECSPACE, float, Infinity>(); TestNumericTraits<TEST_EXECSPACE, double, Infinity>(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, Infinity>(); #endif } TEST(TEST_CATEGORY, numeric_traits_epsilon) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, Epsilon>(); 
TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Epsilon>(); -#endif TestNumericTraits<TEST_EXECSPACE, float, Epsilon>(); TestNumericTraits<TEST_EXECSPACE, double, Epsilon>(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, Epsilon>(); #endif } TEST(TEST_CATEGORY, numeric_traits_round_error) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, RoundError>(); TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, RoundError>(); -#endif TestNumericTraits<TEST_EXECSPACE, float, RoundError>(); TestNumericTraits<TEST_EXECSPACE, double, RoundError>(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, RoundError>(); #endif } TEST(TEST_CATEGORY, numeric_traits_norm_min) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 bit_comparison_type TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, NormMin>(); TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, NormMin>(); -#endif TestNumericTraits<TEST_EXECSPACE, float, NormMin>(); TestNumericTraits<TEST_EXECSPACE, double, NormMin>(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, NormMin>(); #endif } @@ -263,9 
+259,8 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { TEST(TEST_CATEGORY, numeric_traits_denorm_min) { TestNumericTraits<TEST_EXECSPACE, float, DenormMin>(); TestNumericTraits<TEST_EXECSPACE, double, DenormMin>(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, DenormMin>(); #endif } @@ -302,8 +297,8 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) { TestNumericTraits<TEST_EXECSPACE, float, FiniteMax>(); TestNumericTraits<TEST_EXECSPACE, double, FiniteMin>(); TestNumericTraits<TEST_EXECSPACE, double, FiniteMax>(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, FiniteMin>(); TestNumericTraits<TEST_EXECSPACE, long double, FiniteMax>(); #endif @@ -326,8 +321,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Digits>(); TestNumericTraits<TEST_EXECSPACE, float, Digits>(); TestNumericTraits<TEST_EXECSPACE, double, Digits>(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, Digits>(); #endif } @@ -349,8 +344,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Digits10>(); TestNumericTraits<TEST_EXECSPACE, float, Digits10>(); TestNumericTraits<TEST_EXECSPACE, double, Digits10>(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - 
!defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, Digits10>(); #endif } @@ -358,8 +353,8 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TEST(TEST_CATEGORY, numeric_traits_max_digits10) { TestNumericTraits<TEST_EXECSPACE, float, MaxDigits10>(); TestNumericTraits<TEST_EXECSPACE, double, MaxDigits10>(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, MaxDigits10>(); #endif } @@ -380,8 +375,8 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, Radix>(); TestNumericTraits<TEST_EXECSPACE, float, Radix>(); TestNumericTraits<TEST_EXECSPACE, double, Radix>(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, Radix>(); #endif } @@ -395,8 +390,8 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { TestNumericTraits<TEST_EXECSPACE, float, MaxExponent>(); TestNumericTraits<TEST_EXECSPACE, double, MinExponent>(); TestNumericTraits<TEST_EXECSPACE, double, MaxExponent>(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, MinExponent>(); TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent>(); #endif @@ -407,31 +402,36 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) { 
TestNumericTraits<TEST_EXECSPACE, float, MaxExponent10>(); TestNumericTraits<TEST_EXECSPACE, double, MinExponent10>(); TestNumericTraits<TEST_EXECSPACE, double, MaxExponent10>(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, MinExponent10>(); TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent10>(); #endif } + +KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { -#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 +#ifdef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC + GTEST_SKIP() << "This test is known to fail with the NVHPC compiler"; +#endif + TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, QuietNaN>(); TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::half_t, SignalingNaN>(); TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, QuietNaN>(); TestNumericTraits<TEST_EXECSPACE, Kokkos::Experimental::bhalf_t, SignalingNaN>(); -#endif TestNumericTraits<TEST_EXECSPACE, float, QuietNaN>(); TestNumericTraits<TEST_EXECSPACE, float, SignalingNaN>(); TestNumericTraits<TEST_EXECSPACE, double, QuietNaN>(); TestNumericTraits<TEST_EXECSPACE, double, SignalingNaN>(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits<TEST_EXECSPACE, long double, QuietNaN>(); TestNumericTraits<TEST_EXECSPACE, long double, SignalingNaN>(); #endif } +KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() namespace NumericTraitsSFINAE { @@ -442,7 +442,7 @@ struct HasNoSpecialization {}; using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT<T>::value); \ template <class T> \ using has_##TRAIT = 
Kokkos::is_detected<TRAIT##_value_t, T>; \ - static_assert(!has_##TRAIT<HasNoSpecialization>::value, ""); + static_assert(!has_##TRAIT<HasNoSpecialization>::value); CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity) CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min) @@ -524,39 +524,39 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif // clang-format off -static_assert(Kokkos::Experimental::norm_min<float >::value == std::numeric_limits< float>::min(), ""); -static_assert(Kokkos::Experimental::norm_min<double >::value == std::numeric_limits< double>::min(), ""); -static_assert(Kokkos::Experimental::norm_min<long double>::value == std::numeric_limits<long double>::min(), ""); +static_assert(Kokkos::Experimental::norm_min<float >::value == std::numeric_limits< float>::min()); +static_assert(Kokkos::Experimental::norm_min<double >::value == std::numeric_limits< double>::min()); +static_assert(Kokkos::Experimental::norm_min<long double>::value == std::numeric_limits<long double>::min()); // integer types -static_assert(Kokkos::Experimental::finite_min<char >::value == std::numeric_limits< char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<signed char >::value == std::numeric_limits< signed char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<unsigned char >::value == std::numeric_limits< unsigned char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<short >::value == std::numeric_limits< short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<unsigned short >::value == std::numeric_limits< unsigned short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<int >::value == std::numeric_limits< int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<unsigned int >::value == std::numeric_limits< unsigned int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<long int >::value == std::numeric_limits< long int>::min(), ""); 
-static_assert(Kokkos::Experimental::finite_min<unsigned long int >::value == std::numeric_limits< unsigned long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<long long int >::value == std::numeric_limits< long long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_max<char >::value == std::numeric_limits< char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<signed char >::value == std::numeric_limits< signed char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<unsigned char >::value == std::numeric_limits< unsigned char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<short >::value == std::numeric_limits< short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<unsigned short >::value == std::numeric_limits< unsigned short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<int >::value == std::numeric_limits< int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<unsigned int >::value == std::numeric_limits< unsigned int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<long int >::value == std::numeric_limits< long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<unsigned long int >::value == std::numeric_limits< unsigned long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<long long int >::value == std::numeric_limits< long long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::max(), ""); +static_assert(Kokkos::Experimental::finite_min<char >::value == std::numeric_limits< char>::min()); +static_assert(Kokkos::Experimental::finite_min<signed char >::value == std::numeric_limits< signed char>::min()); +static_assert(Kokkos::Experimental::finite_min<unsigned char >::value == 
std::numeric_limits< unsigned char>::min()); +static_assert(Kokkos::Experimental::finite_min<short >::value == std::numeric_limits< short>::min()); +static_assert(Kokkos::Experimental::finite_min<unsigned short >::value == std::numeric_limits< unsigned short>::min()); +static_assert(Kokkos::Experimental::finite_min<int >::value == std::numeric_limits< int>::min()); +static_assert(Kokkos::Experimental::finite_min<unsigned int >::value == std::numeric_limits< unsigned int>::min()); +static_assert(Kokkos::Experimental::finite_min<long int >::value == std::numeric_limits< long int>::min()); +static_assert(Kokkos::Experimental::finite_min<unsigned long int >::value == std::numeric_limits< unsigned long int>::min()); +static_assert(Kokkos::Experimental::finite_min<long long int >::value == std::numeric_limits< long long int>::min()); +static_assert(Kokkos::Experimental::finite_min<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::min()); +static_assert(Kokkos::Experimental::finite_max<char >::value == std::numeric_limits< char>::max()); +static_assert(Kokkos::Experimental::finite_max<signed char >::value == std::numeric_limits< signed char>::max()); +static_assert(Kokkos::Experimental::finite_max<unsigned char >::value == std::numeric_limits< unsigned char>::max()); +static_assert(Kokkos::Experimental::finite_max<short >::value == std::numeric_limits< short>::max()); +static_assert(Kokkos::Experimental::finite_max<unsigned short >::value == std::numeric_limits< unsigned short>::max()); +static_assert(Kokkos::Experimental::finite_max<int >::value == std::numeric_limits< int>::max()); +static_assert(Kokkos::Experimental::finite_max<unsigned int >::value == std::numeric_limits< unsigned int>::max()); +static_assert(Kokkos::Experimental::finite_max<long int >::value == std::numeric_limits< long int>::max()); +static_assert(Kokkos::Experimental::finite_max<unsigned long int >::value == std::numeric_limits< unsigned long int>::max()); 
+static_assert(Kokkos::Experimental::finite_max<long long int >::value == std::numeric_limits< long long int>::max()); +static_assert(Kokkos::Experimental::finite_max<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::max()); // floating point types -static_assert(Kokkos::Experimental::finite_min<float >::value == -std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_min<double >::value == -std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_min<long double>::value == -std::numeric_limits<long double>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<float >::value == std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<double >::value == std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_max<long double>::value == std::numeric_limits<long double>::max(), ""); +static_assert(Kokkos::Experimental::finite_min<float >::value == -std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_min<double >::value == -std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_min<long double>::value == -std::numeric_limits<long double>::max()); +static_assert(Kokkos::Experimental::finite_max<float >::value == std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_max<double >::value == std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_max<long double>::value == std::numeric_limits<long double>::max()); // clang-format on CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits); @@ -623,15 +623,13 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10); #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT -#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ - static_assert(Kokkos::Experimental::TRAIT<T>::value != 
\ - Kokkos::Experimental::TRAIT<T>::value, \ - ""); \ - static_assert( \ - std::numeric_limits<T>::TRAIT() != std::numeric_limits<T>::TRAIT(), ""); \ - static_assert(Kokkos::Experimental::TRAIT<T>::value != \ - std::numeric_limits<T>::TRAIT(), \ - "") +#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ + static_assert(Kokkos::Experimental::TRAIT<T>::value != \ + Kokkos::Experimental::TRAIT<T>::value); \ + static_assert(std::numeric_limits<T>::TRAIT() != \ + std::numeric_limits<T>::TRAIT()); \ + static_assert(Kokkos::Experimental::TRAIT<T>::value != \ + std::numeric_limits<T>::TRAIT()) // Workaround compiler issue error: expression must have a constant value // See kokkos/kokkos#4574 @@ -651,14 +649,11 @@ CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, signaling_NaN); #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT<T const>::value == \ - Kokkos::Experimental::TRAIT<T>::value, \ - ""); \ + Kokkos::Experimental::TRAIT<T>::value); \ static_assert(Kokkos::Experimental::TRAIT<T volatile>::value == \ - Kokkos::Experimental::TRAIT<T>::value, \ - ""); \ + Kokkos::Experimental::TRAIT<T>::value); \ static_assert(Kokkos::Experimental::TRAIT<T const volatile>::value == \ - Kokkos::Experimental::TRAIT<T>::value, \ - "") + Kokkos::Experimental::TRAIT<T>::value) #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ @@ -706,17 +701,13 @@ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(max_exponent10); #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT<T>::value != \ - Kokkos::Experimental::TRAIT<T>::value, \ - ""); \ + Kokkos::Experimental::TRAIT<T>::value); \ static_assert(Kokkos::Experimental::TRAIT<T const>::value != \ - Kokkos::Experimental::TRAIT<T>::value, \ - ""); \ + Kokkos::Experimental::TRAIT<T>::value); \ 
static_assert(Kokkos::Experimental::TRAIT<T volatile>::value != \ - Kokkos::Experimental::TRAIT<T>::value, \ - ""); \ + Kokkos::Experimental::TRAIT<T>::value); \ static_assert(Kokkos::Experimental::TRAIT<T const volatile>::value != \ - Kokkos::Experimental::TRAIT<T>::value, \ - "") + Kokkos::Experimental::TRAIT<T>::value) #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ @@ -728,3 +719,13 @@ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(signaling_NaN); #undef CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT #undef CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES + +#ifdef KOKKOS_COMPILER_NVCC +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic pop +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic pop +#endif +#endif +#endif diff --git a/packages/kokkos/core/unit_test/TestOccupancyControlTrait.hpp b/packages/kokkos/core/unit_test/TestOccupancyControlTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..345a906d6683ddb8956905ff3acc9856ce5d408b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestOccupancyControlTrait.hpp @@ -0,0 +1,80 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +namespace { + +template <class... Properties> +void test_policy_execution(const Kokkos::RangePolicy<Properties...>& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); +} +template <class... 
Properties> +void test_policy_execution(const Kokkos::TeamPolicy<Properties...>& policy) { + Kokkos::parallel_for( + policy, + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy<Properties...>::member_type&){}); +} +template <class... Properties> +void test_policy_execution(const Kokkos::MDRangePolicy<Properties...>& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); +} + +template <class Policy> +void test_prefer_desired_occupancy(Policy policy) { + using Kokkos::Experimental::DesiredOccupancy; + using Kokkos::Experimental::MaximizeOccupancy; + using Kokkos::Experimental::prefer; + using Kokkos::Experimental::WorkItemProperty; + + // MaximizeOccupancy -> MaximizeOccupancy + auto const policy_still_no_occ = prefer(policy, MaximizeOccupancy{}); + test_policy_execution(policy_still_no_occ); + + // MaximizeOccupancy -> DesiredOccupancy + auto const policy_with_occ = + prefer(policy_still_no_occ, DesiredOccupancy{33}); + test_policy_execution(policy_with_occ); + + // DesiredOccupancy -> DesiredOccupancy + auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); + test_policy_execution(policy_change_occ); + + // DesiredOccupancy -> DesiredOccupancy w/ hint + auto policy_with_occ_and_hint = Kokkos::Experimental::require( + policy_change_occ, + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + test_policy_execution(policy_with_occ_and_hint); + + // DesiredOccupancy -> MaximizeOccupancy + auto const policy_drop_occ = + prefer(policy_with_occ_and_hint, MaximizeOccupancy{}); + test_policy_execution(policy_drop_occ); +} + +// FIXME_MSVC_WITH_CUDA +// This test doesn't compile with CUDA on Windows +#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) +TEST(TEST_CATEGORY, occupancy_control) { + test_prefer_desired_occupancy(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1)); + test_prefer_desired_occupancy( + Kokkos::TeamPolicy<TEST_EXECSPACE>{1, Kokkos::AUTO}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy<TEST_EXECSPACE, 
Kokkos::Rank<2>>{{0, 0}, {1, 1}}); +} +#endif +} // namespace diff --git a/packages/kokkos/core/unit_test/TestOther.hpp b/packages/kokkos/core/unit_test/TestOther.hpp index fcf0353a88ca803c77cd992e4e6c70f44c177ac9..9daef3ca3f394db117c62adcd40d2db639781ca6 100644 --- a/packages/kokkos/core/unit_test/TestOther.hpp +++ b/packages/kokkos/core/unit_test/TestOther.hpp @@ -16,13 +16,8 @@ #ifndef KOKKOS_TEST_OTHER_HPP #define KOKKOS_TEST_OTHER_HPP -#include <TestAggregate.hpp> #include <TestMemoryPool.hpp> #include <TestCXX11.hpp> #include <TestViewCtorPropEmbeddedDim.hpp> -// with VS 16.11.3 and CUDA 11.4.2 getting cudafe stackoverflow crash -#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) -#include <TestViewLayoutTiled.hpp> -#endif #endif diff --git a/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp b/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp index 176ce9b5fed40ebe925cd65b0fc1cb3ecdce738f..81ba8c6e2df57abf27110bdc4adf3c9dc36702b4 100644 --- a/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp +++ b/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp @@ -92,7 +92,7 @@ class EnvVarsHelper { teardown(); mutex_.unlock(); } - EnvVarsHelper(EnvVarsHelper&) = delete; + EnvVarsHelper(EnvVarsHelper&) = delete; EnvVarsHelper& operator=(EnvVarsHelper&) = delete; friend std::ostream& operator<<(std::ostream& os, EnvVarsHelper const& ev) { for (auto const& name : ev.vars_) { @@ -166,22 +166,6 @@ TEST(defaultdevicetype, cmd_line_args_device_id) { EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--dummy"}); } -TEST(defaultdevicetype, cmd_line_args_num_devices) { - CmdLineArgsHelper cla = {{ - "--kokkos-num-devices=5,6", - "--kokkos-num-devices=7", - "-v", - }}; - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 7); - // this is the current behavior, not 
suggesting this cannot be revisited - EXPECT_TRUE(settings.has_skip_device()) << "behavior changed see comment"; - EXPECT_EQ(settings.get_skip_device(), 6) << "behavior changed see comment"; - EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"-v"}); -} - TEST(defaultdevicetype, cmd_line_args_disable_warning) { CmdLineArgsHelper cla = {{ "--kokkos-disable-warnings=1", @@ -351,20 +335,6 @@ TEST(defaultdevicetype, env_vars_device_id) { EXPECT_EQ(settings.get_device_id(), 33); } -TEST(defaultdevicetype, env_vars_num_devices) { - EnvVarsHelper ev = {{ - {"KOKKOS_NUM_DEVICES", "4"}, - {"KOKKOS_SKIP_DEVICE", "1"}, - }}; - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_environment_variables(settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 4); - EXPECT_TRUE(settings.has_skip_device()); - EXPECT_EQ(settings.get_skip_device(), 1); -} - TEST(defaultdevicetype, env_vars_disable_warnings) { for (auto const& value_true : {"1", "true", "TRUE", "yEs"}) { EnvVarsHelper ev = {{ @@ -420,30 +390,31 @@ TEST(defaultdevicetype, env_vars_tune_internals) { } TEST(defaultdevicetype, visible_devices) { -#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ - do { \ - EnvVarsHelper ev{ENV}; \ - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ - Kokkos::InitializationSettings settings; \ - Kokkos::Impl::parse_environment_variables(settings); \ - auto computed = Kokkos::Impl::get_visible_devices(settings, CNT); \ - std::vector<int> expected = DEV; \ - EXPECT_EQ(expected.size(), computed.size()) \ - << ev << "device count: " << CNT; \ - auto n = std::min<int>(expected.size(), computed.size()); \ - for (int i = 0; i < n; ++i) { \ - EXPECT_EQ(expected[i], computed[i]) \ - << "devices differ at index " << i << '\n' \ - << ev << "device count: " << CNT; \ - } \ +#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ + do { \ + EnvVarsHelper ev{ENV}; \ + SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ 
+ auto computed = Kokkos::Impl::get_visible_devices(CNT); \ + std::vector<int> expected = DEV; \ + EXPECT_EQ(expected.size(), computed.size()) \ + << ev << "device count: " << CNT; \ + auto n = std::min<int>(expected.size(), computed.size()); \ + for (int i = 0; i < n; ++i) { \ + EXPECT_EQ(expected[i], computed[i]) \ + << "devices differ at index " << i << '\n' \ + << ev << "device count: " << CNT; \ + } \ } while (false) #define DEV(...) \ std::vector<int> { __VA_ARGS__ } -#define ENV(...) std::unordered_map<std::string, std::string>{__VA_ARGS__} +#define ENV(...) \ + std::unordered_map<std::string, std::string> { __VA_ARGS__ } // first test with all environment variables that are involved in determining // the visible devices so user set var do not mess up the logic below. + // KOKKOS_NUM_DEVICES and KOKKOS_SKIP_DEVICE are deprecated since 3.7 and are + // not taken into account anymore. KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, {"KOKKOS_SKIP_DEVICE", "1"}), @@ -452,10 +423,10 @@ TEST(defaultdevicetype, visible_devices) { ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, ), 6, DEV(2, 1)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_NUM_DEVICES", "3"}), 6, - DEV(0, 1, 2)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_NUM_DEVICES", "4"}, {"KOKKOS_SKIP_DEVICE", "1"}, ), 6, - DEV(0, 2, 3)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_VISIBLE_DEVICES", "1,3,4"}), 6, DEV(1, 3, 4)); KOKKOS_TEST_VISIBLE_DEVICES( diff --git a/packages/kokkos/core/unit_test/TestRange.hpp b/packages/kokkos/core/unit_test/TestRange.hpp index 8cd95a24bff079a3b93e7b264195c97409b10206..be19e6386bed36e47d34444189e6700b96bd985c 100644 --- a/packages/kokkos/core/unit_test/TestRange.hpp +++ b/packages/kokkos/core/unit_test/TestRange.hpp @@ -60,33 +60,10 @@ struct TestRange { Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), *this); - { - using ThisType = 
TestRange<ExecSpace, ScheduleType>; - std::string label("parallel_for"); - Kokkos::Impl::ParallelConstructName<ThisType, void> pcn(label); - ASSERT_EQ(pcn.get(), label); - std::string empty_label(""); - Kokkos::Impl::ParallelConstructName<ThisType, void> empty_pcn( - empty_label); - ASSERT_EQ(empty_pcn.get(), typeid(ThisType).name()); - } - Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyInitTag>(0, N), *this); - { - using ThisType = TestRange<ExecSpace, ScheduleType>; - std::string label("parallel_for"); - Kokkos::Impl::ParallelConstructName<ThisType, VerifyInitTag> pcn(label); - ASSERT_EQ(pcn.get(), label); - std::string empty_label(""); - Kokkos::Impl::ParallelConstructName<ThisType, VerifyInitTag> empty_pcn( - empty_label); - ASSERT_EQ(empty_pcn.get(), std::string(typeid(ThisType).name()) + "/" + - typeid(VerifyInitTag).name()); - } - Kokkos::deep_copy(host_flags, m_flags); int error_count = 0; @@ -202,7 +179,6 @@ struct TestRange { } void test_dynamic_policy() { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) auto const N_no_implicit_capture = N; using policy_t = Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >; @@ -288,7 +264,6 @@ struct TestRange { //} } } -#endif } }; diff --git a/packages/kokkos/core/unit_test/TestRangePolicyCTAD.cpp b/packages/kokkos/core/unit_test/TestRangePolicyCTAD.cpp new file mode 100644 index 0000000000000000000000000000000000000000..20288e2b40a2bff4ddd215dac4bba33e08b338f6 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestRangePolicyCTAD.cpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> +#include "Kokkos_Core_fwd.hpp" + +namespace { + +struct TestRangePolicyCTAD { + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + + [[maybe_unused]] static int concurrency() { return 0; } + }; + static_assert(Kokkos::is_execution_space_v<SomeExecutionSpace>); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline auto i64 = int64_t(); + [[maybe_unused]] static inline auto i32 = int32_t(); + [[maybe_unused]] static inline auto cs = Kokkos::ChunkSize(0); + [[maybe_unused]] static inline auto des = Kokkos::DefaultExecutionSpace(); + [[maybe_unused]] static inline auto nes = + ImplicitlyConvertibleToDefaultExecutionSpace(); + [[maybe_unused]] static inline auto ses = SomeExecutionSpace(); + + // RangePolicy() + + [[maybe_unused]] static inline auto rp = Kokkos::RangePolicy{}; + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rp)>); + + // RangePolicy(index_type, index_type) + + [[maybe_unused]] static inline auto rpi64i64 = Kokkos::RangePolicy(i64, i64); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i64)>); + + [[maybe_unused]] static inline auto rpi64i32 = Kokkos::RangePolicy(i64, i32); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i32)>); + + [[maybe_unused]] static inline auto rpi32i64 = Kokkos::RangePolicy(i32, i64); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i64)>); + + [[maybe_unused]] static inline auto rpi32i32 = Kokkos::RangePolicy(i32, i32); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, 
decltype(rpi32i32)>); + + // RangePolicy(index_type, index_type, ChunkSize) + + [[maybe_unused]] static inline auto rpi64i64cs = + Kokkos::RangePolicy(i64, i64, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i64cs)>); + + [[maybe_unused]] static inline auto rpi64i32cs = + Kokkos::RangePolicy(i64, i32, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i32cs)>); + + [[maybe_unused]] static inline auto rpi32i64cs = + Kokkos::RangePolicy(i32, i64, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i64cs)>); + + [[maybe_unused]] static inline auto rpi32i32cs = + Kokkos::RangePolicy(i32, i32, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i32cs)>); + + // RangePolicy(execution_space, index_type, index_type) + + [[maybe_unused]] static inline auto rpdesi64i64 = + Kokkos::RangePolicy(des, i64, i64); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi64i64)>); + + [[maybe_unused]] static inline auto rpdesi32i32 = + Kokkos::RangePolicy(des, i32, i32); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi32i32)>); + + [[maybe_unused]] static inline auto rpnesi64i64 = + Kokkos::RangePolicy(nes, i64, i64); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi64i64)>); + + [[maybe_unused]] static inline auto rpnesi32i32 = + Kokkos::RangePolicy(nes, i32, i32); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi32i32)>); + + [[maybe_unused]] static inline auto rpsesi64i64 = + Kokkos::RangePolicy(ses, i64, i64); + static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>, + decltype(rpsesi64i64)>); + + [[maybe_unused]] static inline auto rpsesi32i32 = + Kokkos::RangePolicy(ses, i32, i32); + static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>, + decltype(rpsesi32i32)>); + + // RangePolicy(execution_space, index_type, index_type, ChunkSize) + + [[maybe_unused]] static inline auto rpdesi64i64cs 
= + Kokkos::RangePolicy(des, i64, i64, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi64i64cs)>); + + [[maybe_unused]] static inline auto rpdesi32i32cs = + Kokkos::RangePolicy(des, i32, i32, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi32i32cs)>); + + [[maybe_unused]] static inline auto rpnesi64i64cs = + Kokkos::RangePolicy(nes, i64, i64, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi64i64cs)>); + + [[maybe_unused]] static inline auto rpnesi32i32cs = + Kokkos::RangePolicy(nes, i32, i32, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi32i32cs)>); + + [[maybe_unused]] static inline auto rpsesi64i64cs = + Kokkos::RangePolicy(ses, i64, i64, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>, + decltype(rpsesi64i64cs)>); + + [[maybe_unused]] static inline auto rpsesi32i32cs = + Kokkos::RangePolicy(ses, i32, i32, cs); + static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>, + decltype(rpsesi32i32cs)>); + +}; // TestRangePolicyCTAD struct + +// To eliminate maybe_unused warning on some compilers + +[[maybe_unused]] const Kokkos::DefaultExecutionSpace nestodes = + TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace(); + +[[maybe_unused]] const auto sesconcurrency = + TestRangePolicyCTAD::ses.concurrency(); + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestRangePolicyConstructors.hpp b/packages/kokkos/core/unit_test/TestRangePolicyConstructors.hpp index 0a7e59ed980c4b47ba965c02c07d2c378453689e..ebd4128db7014e44d1cd84c4ce5eab36d7afd521 100644 --- a/packages/kokkos/core/unit_test/TestRangePolicyConstructors.hpp +++ b/packages/kokkos/core/unit_test/TestRangePolicyConstructors.hpp @@ -18,6 +18,10 @@ #include <Kokkos_Core.hpp> +#include <regex> +#include <limits> +#include <type_traits> + namespace { TEST(TEST_CATEGORY, range_policy_runtime_parameters) { @@ -70,4 +74,269 @@ TEST(TEST_CATEGORY, 
range_policy_runtime_parameters) { } } +TEST(TEST_CATEGORY_DEATH, range_policy_invalid_bounds) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using Policy = Kokkos::RangePolicy<TEST_EXECSPACE>; + using ChunkSize = Kokkos::ChunkSize; + + std::string msg = + "Kokkos::RangePolicy bounds error: The lower bound (100) is greater than " + "the upper bound (90).\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // escape the parentheses in the regex to match the error message + msg = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ASSERT_DEATH({ (void)Policy(100, 90); }, msg); + + ASSERT_DEATH({ (void)Policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); }, + msg); +#else + + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + { + ::testing::internal::CaptureStderr(); + Policy policy(100, 90); + ASSERT_EQ((int)policy.begin(), 0); + ASSERT_EQ((int)policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; +#endif + } + + { + ::testing::internal::CaptureStderr(); + Policy policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); + ASSERT_EQ((int)policy.begin(), 0); + ASSERT_EQ((int)policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; +#endif + } + +#endif +} + +struct W { // round-trip conversion check for narrowing should "fire" + W(int const* ptr) : val_(*ptr) {} + W(int) : val_(0) {} + operator int() const { return val_; } + + int val_; +}; + +TEST(TEST_CATEGORY_DEATH, range_policy_round_trip_conversion_fires) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using Policy = Kokkos::RangePolicy<>; + + static_assert(std::is_convertible_v<W, Policy::index_type>); + 
static_assert(std::is_convertible_v<Policy::index_type, W>); + + int const n = 1; + [[maybe_unused]] std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion is " + "performed"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + ASSERT_DEATH((void)Policy(0, W(&n)), msg); +#else + ::testing::internal::CaptureStderr(); + (void)Policy(0, W(&n)); + auto s = std::string(::testing::internal::GetCapturedStderr()); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + ASSERT_NE(s.find(msg), std::string::npos) << msg; + } else +#endif + ASSERT_TRUE(s.empty()); +#endif +} + +struct B { // round-trip conversion would not compile + B(int const* ptr) : val_(*ptr) {} + operator int() const { return val_; } + + int val_; +}; + +TEST(TEST_CATEGORY, range_policy_one_way_convertible_bounds) { + using Policy = Kokkos::RangePolicy<>; + using IndexType = Policy::index_type; + + static_assert(std::is_convertible_v<B, IndexType>); + static_assert(!std::is_convertible_v<IndexType, B>); + + int const n = 1; + Policy policy(0, B(&n)); + EXPECT_EQ(policy.begin(), static_cast<IndexType>(0)); + EXPECT_EQ(policy.end(), static_cast<IndexType>(1)); +} + +TEST(TEST_CATEGORY_DEATH, range_policy_check_sign_changes) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using UInt32Policy = + Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::IndexType<std::uint32_t>>; + + [[maybe_unused]] std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion is " + "performed"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + { + std::int64_t n = std::numeric_limits<std::int64_t>::max(); + ASSERT_DEATH((void)UInt32Policy(0, n), msg); + } + { + std::int64_t n = std::numeric_limits<std::int64_t>::min(); + ASSERT_DEATH((void)UInt32Policy(n, 0), msg); + } +#else + { + ::testing::internal::CaptureStderr(); + std::int64_t n = std::numeric_limits<std::int64_t>::max(); + (void)UInt32Policy(0, n); + auto s = 
std::string(::testing::internal::GetCapturedStderr()); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + ASSERT_NE(s.find(msg), std::string::npos) << msg; + } +#endif + } + { + ::testing::internal::CaptureStderr(); + std::int64_t n = std::numeric_limits<std::int64_t>::min(); + (void)UInt32Policy(n, 0); + auto s = std::string(::testing::internal::GetCapturedStderr()); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + ASSERT_NE(s.find(msg), std::string::npos) << msg; + } +#endif + } +#endif +} + +TEST(TEST_CATEGORY_DEATH, range_policy_implicitly_converted_bounds) { + using UIntIndexType = Kokkos::IndexType<unsigned>; + using IntIndexType = Kokkos::IndexType<int>; + using UIntPolicy = Kokkos::RangePolicy<TEST_EXECSPACE, UIntIndexType>; + using IntPolicy = Kokkos::RangePolicy<TEST_EXECSPACE, IntIndexType>; + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion is " + "performed on a bound (), which may not preserve its original value.\n"; + + [[maybe_unused]] auto get_error_msg = [](auto str, auto val) { + return str.insert(str.find("(") + 1, std::to_string(val).c_str()); + }; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10); }, + get_error_msg(expected, test_val)); + } + { + unsigned test_val = std::numeric_limits<unsigned>::max(); + ASSERT_DEATH({ (void)IntPolicy(0u, test_val); }, + get_error_msg(expected, test_val)); + } + { + long long test_val = std::numeric_limits<long long>::max(); + ASSERT_DEATH({ (void)IntPolicy(0LL, test_val); }, + get_error_msg(expected, test_val)); + } + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10, Kokkos::ChunkSize(2)); }, + get_error_msg(expected, test_val)); + } + +#else + { + ::testing::internal::CaptureStderr(); + 
int test_val = -1; + UIntPolicy policy(test_val, 10); + ASSERT_EQ(policy.begin(), 0u); + ASSERT_EQ(policy.end(), 0u); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } + { + ::testing::internal::CaptureStderr(); + unsigned test_val = std::numeric_limits<unsigned>::max(); + IntPolicy policy(0u, test_val); + ASSERT_EQ(policy.begin(), 0); + ASSERT_EQ(policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } +#endif +} + +constexpr bool test_chunk_size_explicit() { + using ExecutionSpace = TEST_EXECSPACE; + using Kokkos::ChunkSize; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + static_assert(std::is_convertible_v<int, ChunkSize>); + static_assert(std::is_constructible_v<ChunkSize, int>); + // Some execution spaces were implicitly constructible from int + // which made the constructor call ambiguous. 
+ static_assert( + std::is_constructible_v<Kokkos::DefaultExecutionSpace, int> || + std::is_constructible_v< + Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>, int, int, int>); + static_assert(std::is_constructible_v< + Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>, int, int, + ChunkSize>); + static_assert(std::is_constructible_v<Kokkos::RangePolicy<ExecutionSpace>, + ExecutionSpace, int, int, int>); + static_assert(std::is_constructible_v<Kokkos::RangePolicy<ExecutionSpace>, + ExecutionSpace, int, int, ChunkSize>); +#else + static_assert(!std::is_convertible_v<int, ChunkSize>); + static_assert(std::is_constructible_v<ChunkSize, int>); + static_assert( + !std::is_constructible_v< + Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>, int, int, int>); + static_assert(std::is_constructible_v< + Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>, int, int, + ChunkSize>); + static_assert(!std::is_constructible_v<Kokkos::RangePolicy<ExecutionSpace>, + ExecutionSpace, int, int, int>); + static_assert(std::is_constructible_v<Kokkos::RangePolicy<ExecutionSpace>, + ExecutionSpace, int, int, ChunkSize>); +#endif + return true; +} + +static_assert(test_chunk_size_explicit()); + } // namespace diff --git a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp index 975ac8bd7e3c78b19fb5af6138a260024e37bce4..553ddade84889c01d2978f53014466045cbf19df 100644 --- a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp +++ b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp @@ -55,35 +55,12 @@ struct TestRangeRequire { Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), Property()), *this); - { - using ThisType = TestRangeRequire<ExecSpace, ScheduleType, Property>; - std::string label("parallel_for"); - Kokkos::Impl::ParallelConstructName<ThisType, void> pcn(label); - ASSERT_EQ(pcn.get(), label); - std::string empty_label(""); - Kokkos::Impl::ParallelConstructName<ThisType, void> empty_pcn( - 
empty_label); - ASSERT_EQ(empty_pcn.get(), typeid(ThisType).name()); - } - Kokkos::parallel_for( Kokkos::Experimental::require( Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyInitTag>(0, N), Property()), *this); - { - using ThisType = TestRangeRequire<ExecSpace, ScheduleType, Property>; - std::string label("parallel_for"); - Kokkos::Impl::ParallelConstructName<ThisType, VerifyInitTag> pcn(label); - ASSERT_EQ(pcn.get(), label); - std::string empty_label(""); - Kokkos::Impl::ParallelConstructName<ThisType, VerifyInitTag> empty_pcn( - empty_label); - ASSERT_EQ(empty_pcn.get(), std::string(typeid(ThisType).name()) + "/" + - typeid(VerifyInitTag).name()); - } - Kokkos::deep_copy(host_flags, m_flags); int error_count = 0; @@ -214,7 +191,6 @@ struct TestRangeRequire { //---------------------------------------- void test_dynamic_policy() { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) auto const N_no_implicit_capture = N; using policy_t = Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >; @@ -300,7 +276,6 @@ struct TestRangeRequire { //} } } -#endif } }; diff --git a/packages/kokkos/core/unit_test/TestRealloc.hpp b/packages/kokkos/core/unit_test/TestRealloc.hpp index 2c9dc5ee47322d375caab016b733216f88fb240b..f30c9e15e1c045ddc78250d249bdee40e83fa87c 100644 --- a/packages/kokkos/core/unit_test/TestRealloc.hpp +++ b/packages/kokkos/core/unit_test/TestRealloc.hpp @@ -144,6 +144,11 @@ void impl_testRealloc() { EXPECT_EQ(oldPointer, newPointer); } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; template <class DeviceType> void testRealloc() { @@ -154,6 +159,14 @@ void testRealloc() { impl_testRealloc<DeviceType, WithoutInitializing>(); // without data initialization } + // Check #6992 fix (no default initialization in realloc without initializing) + { + using view_type = Kokkos::View<NoDefaultConstructor*, DeviceType>; + view_type view_1d_no_default( + 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + realloc_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewRealloc diff --git a/packages/kokkos/core/unit_test/TestReduce.hpp b/packages/kokkos/core/unit_test/TestReduce.hpp index e1aa851f10203d5a5d2c29a79fd83ed6ab52f4bb..b45d6d321ed5aadf161d2171f8700e91a2362591 100644 --- a/packages/kokkos/core/unit_test/TestReduce.hpp +++ b/packages/kokkos/core/unit_test/TestReduce.hpp @@ -571,20 +571,37 @@ TEST(TEST_CATEGORY, mdrange_combined_reduce) { constexpr uint64_t nw = 1000; uint64_t nsum = (nw / 2) * (nw + 1); + { + int64_t result1 = 0; + int64_t result2 = 0; + int64_t result3 = 0; - int64_t result1 = 0; - int64_t result2 = 0; - int64_t result3 = 0; - - Kokkos::parallel_reduce( - "int_combined_reduce_mdrange", - Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<3>>({{0, 0, 0}}, - {{nw, 1, 1}}), - functor_type(nw), result1, result2, result3); + Kokkos::parallel_reduce( + "int_combined_reduce_mdrange", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<3>>({{0, 0, 0}}, + {{nw, 1, 1}}), + functor_type(nw), result1, result2, result3); - ASSERT_EQ(nw, uint64_t(result1)); - ASSERT_EQ(nsum, uint64_t(result2)); - ASSERT_EQ(nsum, uint64_t(result3)); + ASSERT_EQ(nw, uint64_t(result1)); + ASSERT_EQ(nsum, uint64_t(result2)); + ASSERT_EQ(nsum, uint64_t(result3)); + } + { + int64_t result1 = 0; + int64_t result2 = 0; + int64_t result3 = 0; + + Kokkos::parallel_reduce( + "int_combined_reduce_mdrange", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<3>, + Kokkos::Schedule<Kokkos::Dynamic>>({{0, 0, 0}}, + {{nw, 1, 1}}), + functor_type(nw), result1, result2, result3); + + ASSERT_EQ(nw, uint64_t(result1)); + ASSERT_EQ(nsum, uint64_t(result2)); + ASSERT_EQ(nsum, uint64_t(result3)); + } } TEST(TEST_CATEGORY, int_combined_reduce_mixed) { @@ -625,4 +642,34 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) { } #endif #endif + +#if defined(NDEBUG) +// the following test was 
made for: +// https://github.com/kokkos/kokkos/issues/6517 + +struct FunctorReductionWithLargeIterationCount { + KOKKOS_FUNCTION void operator()(const int64_t /*i*/, double& update) const { + update += 1.0; + } +}; + +TEST(TEST_CATEGORY, reduction_with_large_iteration_count) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, + Kokkos::HostSpace>) { + GTEST_SKIP() << "Disabling for host backends"; + } + + const int64_t N = pow(2LL, 39LL) - pow(2LL, 8LL) + 1; + Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::IndexType<int64_t>> p(0, N); + double nu = 0; + Kokkos::parallel_reduce("sample reduction", p, + FunctorReductionWithLargeIterationCount(), nu); + ASSERT_DOUBLE_EQ(nu, double(N)); +} +#endif + } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp index c6b6249bdf1b869917054d61460d292f5732f04c..236a99d29c1d519a15fe23b7131fa40db7fb88ac 100644 --- a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp +++ b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp @@ -492,27 +492,21 @@ struct TestReduceCombinatoricalInstantiation { template <class... Args> static void AddFunctorLambdaRange(int N, Args... args) { AddFunctor<0, Args...>(N, args...); -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - AddLambdaRange( - N, - std::conditional_t< - std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, - void*, Kokkos::InvalidType>(), - args...); -#endif + AddLambdaRange(N, + std::conditional_t< + std::is_same_v<ExecSpace, Kokkos::DefaultExecutionSpace>, + void*, Kokkos::InvalidType>(), + args...); } template <class... Args> static void AddFunctorLambdaTeam(int N, Args... 
args) { AddFunctor<1, Args...>(N, args...); -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - AddLambdaTeam( - N, - std::conditional_t< - std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, - void*, Kokkos::InvalidType>(), - args...); -#endif + AddLambdaTeam(N, + std::conditional_t< + std::is_same_v<ExecSpace, Kokkos::DefaultExecutionSpace>, + void*, Kokkos::InvalidType>(), + args...); } template <class... Args> diff --git a/packages/kokkos/core/unit_test/TestReducers.hpp b/packages/kokkos/core/unit_test/TestReducers.hpp index 957b9a0ca1a7e432ed4f579ec9e7320776e3981e..b6633d02c4aa4c76db7bc3f2dba0a5a5517ea1d8 100644 --- a/packages/kokkos/core/unit_test/TestReducers.hpp +++ b/packages/kokkos/core/unit_test/TestReducers.hpp @@ -19,6 +19,7 @@ #include <limits> #include <Kokkos_Core.hpp> +#include <TestNonTrivialScalarTypes.hpp> //-------------------------------------------------------------------------- @@ -46,6 +47,37 @@ struct TestReducers { void operator()(const int& i, Scalar& value) const { value += values(i); } }; + struct TeamSumFunctor { + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m, Scalar& value) const { + if (m.team_rank() == m.team_size() - 1) value += Scalar(1); + } + }; + + struct TeamSumNestedFunctor { + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + SumFunctor f; + int M, N; + Kokkos::View<Scalar*, ExecSpace> result; + + TeamSumNestedFunctor(SumFunctor& f_, const int M_, const int N_, + Kokkos::View<Scalar*, ExecSpace> result_) + : f(f_), M(M_), N(N_), result(result_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m) const { + const int i = m.league_rank(); + Scalar local_scalar; + Kokkos::Sum<Scalar, typename ExecSpace::memory_space> reducer_scalar( + local_scalar); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(m, N), f, reducer_scalar); + result(i) = local_scalar; + } + }; + struct ProdFunctor { 
Kokkos::View<const Scalar*, ExecSpace> values; @@ -319,6 +351,102 @@ struct TestReducers { value = value || values(i); } }; + + // get number of teams for TeamPolicy depending on the tested type + constexpr static int get_num_teams() { + if constexpr (sizeof(Scalar) == 1) { + return 126; + } else if constexpr (std::is_same_v<Scalar, + Kokkos::Experimental::bhalf_t>) { + return 256; + } + + return 1024; + } + + static void test_sum_team_policy(int N, SumFunctor f, Scalar reference_sum) { +#ifdef KOKKOS_ENABLE_OPENACC + if constexpr (std::is_same_v<ExecSpace, Kokkos::Experimental::OpenACC> && + (std::is_same_v<Scalar, size_t> || + std::is_same_v<Scalar, double>)) { + return; // FIXME_OPENACC + } +#endif + + Scalar sum_scalar; + Kokkos::View<Scalar, ExecSpace> sum_view("result"); + Kokkos::deep_copy(sum_view, Scalar(1)); + + // Test team policy reduction + { + constexpr int num_teams = get_num_teams(); + TeamSumFunctor tf; + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value + ? 32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy<ExecSpace>(num_teams, team_size); + Kokkos::parallel_reduce(team_pol, tf, sum_view); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams; + } + + // Test TeamThreadRange level reduction with 0 work produces 0 result + { + const int league_size = 1; + Kokkos::View<Scalar*, ExecSpace> result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, 0, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value + ? 
32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy<ExecSpace>(1, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + ASSERT_EQ(result_h(0), Scalar{0}) << "N: " << N; + } + + // Same test as above, but with inner reduction over N, and league_size=10 + { + const int league_size = 10; + Kokkos::View<Scalar*, ExecSpace> result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, N, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int initial_team_size = + std::is_same_v<ExecSpace, Kokkos::Experimental::OpenMPTarget> ? 32 + : 1; +#else + int initial_team_size = 1; +#endif + auto team_size_max = + Kokkos::TeamPolicy<ExecSpace>(league_size, initial_team_size) + .team_size_max(tnf, Kokkos::ParallelForTag()); + auto team_size = std::min(team_size_max, TEST_EXECSPACE().concurrency()); + auto team_pol = Kokkos::TeamPolicy<ExecSpace>(league_size, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + for (int i = 0; i < result_h.extent_int(0); ++i) { + ASSERT_EQ(result_h(i), reference_sum) << "N: " << N; + } + } + } + static void test_sum(int N) { Kokkos::View<Scalar*, ExecSpace> values("Values", N); auto h_values = Kokkos::create_mirror_view(values); @@ -338,10 +466,9 @@ struct TestReducers { // This mask addresses #4719 for N <= 51. // The mask is not needed for N <= 25. // clang-format on - int mask = - std::is_same<Scalar, Kokkos::Experimental::bhalf_t>::value && N > 25 - ? (int)0xfffffffe - : (int)0xffffffff; + int mask = std::is_same_v<Scalar, Kokkos::Experimental::bhalf_t> && N > 25 + ? 
(int)0xfffffffe + : (int)0xffffffff; h_values(i) = (Scalar)((rand() % denom) & mask); reference_sum += h_values(i); } @@ -374,6 +501,8 @@ struct TestReducers { ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N; } + test_sum_team_policy(N, f, reference_sum); + { Kokkos::View<Scalar, Kokkos::HostSpace> sum_view("View"); sum_view() = Scalar(1); @@ -694,6 +823,38 @@ struct TestReducers { } } + static void test_minloc_loc_init(int N) { + using reducer_type = Kokkos::MinLoc<Scalar, int>; + using reducer_value_type = typename reducer_type::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + + for (int i = 0; i < N; ++i) { + h_values(i) = Kokkos::reduction_identity<Scalar>::min(); + } + Kokkos::deep_copy(values, h_values); + + reducer_value_type value_loc{0, -1}; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int i, reducer_value_type& update) { + auto x = values(i); + if (i % 2 == 0) + return; + else if (x <= update.val) { + update.val = x; + update.loc = i; + } + }, + reducer_type(value_loc)); + + ASSERT_EQ(value_loc.val, h_values(0)); + ASSERT_GE(value_loc.loc, 0); + ASSERT_LT(value_loc.loc, N); + } + static void test_maxloc(int N) { using value_type = typename Kokkos::MaxLoc<Scalar, int>::value_type; @@ -795,6 +956,38 @@ struct TestReducers { } } + static void test_maxloc_loc_init(int N) { + using reducer_type = Kokkos::MaxLoc<Scalar, int>; + using reducer_value_type = typename reducer_type::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + + for (int i = 0; i < N; ++i) { + h_values(i) = Kokkos::reduction_identity<Scalar>::max(); + } + Kokkos::deep_copy(values, h_values); + + reducer_value_type value_loc{0, -1}; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int i, reducer_value_type& update) { + auto x = values(i); + if (i % 2 
== 0) + return; + else if (x >= update.val) { + update.val = x; + update.loc = i; + } + }, + reducer_type(value_loc)); + + ASSERT_EQ(value_loc.val, h_values(0)); + ASSERT_GE(value_loc.loc, 0); + ASSERT_LT(value_loc.loc, N); + } + static void test_minmaxloc(int N) { using value_type = typename Kokkos::MinMaxLoc<Scalar, int>::value_type; @@ -983,6 +1176,188 @@ struct TestReducers { } } + static void test_minmaxloc_loc_init(int N) { + using reducer_type = Kokkos::MinMaxLoc<Scalar, int>; + using reducer_value_type = typename reducer_type::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + + auto functor = KOKKOS_LAMBDA(const int i, reducer_value_type& update) { + auto x = values(i); + if (i % 2 == 0) return; + if (x <= update.min_val) { + update.min_val = x; + update.min_loc = i; + } + if (x >= update.max_val) { + update.max_val = x; + update.max_loc = i; + } + }; + + { + for (int i = 0; i < N; ++i) { + h_values(i) = Kokkos::reduction_identity<Scalar>::min(); + } + Kokkos::deep_copy(values, h_values); + + reducer_value_type value_loc{0, 0, -1, -1}; + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), functor, + reducer_type(value_loc)); + + ASSERT_EQ(value_loc.min_val, h_values(0)); + ASSERT_EQ(value_loc.max_val, h_values(0)); + ASSERT_GE(value_loc.min_loc, 0); + ASSERT_LT(value_loc.min_loc, N); + ASSERT_GE(value_loc.max_loc, 0); + ASSERT_LT(value_loc.max_loc, N); + } + + { + for (int i = 0; i < N; ++i) { + h_values(i) = Kokkos::reduction_identity<Scalar>::max(); + } + Kokkos::deep_copy(values, h_values); + + reducer_value_type value_loc{0, 0, -1, -1}; + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), functor, + reducer_type(value_loc)); + + ASSERT_EQ(value_loc.min_val, h_values(0)); + ASSERT_EQ(value_loc.max_val, h_values(0)); + ASSERT_GE(value_loc.min_loc, 0); + ASSERT_LT(value_loc.min_loc, N); + ASSERT_GE(value_loc.max_loc, 0); + ASSERT_LT(value_loc.max_loc, N); + 
} + } + + static void test_minmaxfirstlastloc_loc_init(int N) { + using reducer_type = Kokkos::MinMaxFirstLastLoc<Scalar, int>; + using reducer_value_type = typename reducer_type::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + + auto functor = KOKKOS_LAMBDA(const int i, reducer_value_type& update) { + auto x = values(i); + if (i % 2 == 0) return; + if (x <= update.min_val) { + update.min_val = x; + update.min_loc = i; + } + if (x >= update.max_val) { + update.max_val = x; + update.max_loc = i; + } + }; + + { + for (int i = 0; i < N; ++i) { + h_values(i) = Kokkos::reduction_identity<Scalar>::min(); + } + Kokkos::deep_copy(values, h_values); + + reducer_value_type value_loc{0, 0, -1, -1}; + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), functor, + reducer_type(value_loc)); + + ASSERT_EQ(value_loc.min_val, h_values(0)); + ASSERT_EQ(value_loc.max_val, h_values(0)); + ASSERT_GE(value_loc.min_loc, 0); + ASSERT_LT(value_loc.min_loc, N); + ASSERT_GE(value_loc.max_loc, 0); + ASSERT_LT(value_loc.max_loc, N); + } + + { + for (int i = 0; i < N; ++i) { + h_values(i) = Kokkos::reduction_identity<Scalar>::max(); + } + Kokkos::deep_copy(values, h_values); + + reducer_value_type value_loc{0, 0, -1, -1}; + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), functor, + reducer_type(value_loc)); + + ASSERT_EQ(value_loc.min_val, h_values(0)); + ASSERT_EQ(value_loc.max_val, h_values(0)); + ASSERT_GE(value_loc.min_loc, 0); + ASSERT_LT(value_loc.min_loc, N); + ASSERT_GE(value_loc.max_loc, 0); + ASSERT_LT(value_loc.max_loc, N); + } + } + + static void test_minfirstloc_loc_init(int N) { + using reducer_type = Kokkos::MinFirstLoc<Scalar, int>; + using reducer_value_type = typename reducer_type::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + + for (int i = 0; i < N; ++i) { + h_values(i) = 
Kokkos::reduction_identity<Scalar>::min(); + } + Kokkos::deep_copy(values, h_values); + + reducer_value_type value_loc{0, -1}; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int i, reducer_value_type& update) { + auto x = values(i); + if (i % 2 == 0) + return; + else if (x <= update.val) { + update.val = x; + update.loc = i; + } + }, + reducer_type(value_loc)); + + ASSERT_EQ(value_loc.val, h_values(0)); + ASSERT_GE(value_loc.loc, 0); + ASSERT_LT(value_loc.loc, N); + } + + static void test_maxfirstloc_loc_init(int N) { + using reducer_type = Kokkos::MaxFirstLoc<Scalar, int>; + using reducer_value_type = typename reducer_type::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + + for (int i = 0; i < N; ++i) { + h_values(i) = Kokkos::reduction_identity<Scalar>::max(); + } + Kokkos::deep_copy(values, h_values); + + reducer_value_type value_loc{0, -1}; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int i, reducer_value_type& update) { + auto x = values(i); + if (i % 2 == 0) + return; + else if (x >= update.val) { + update.val = x; + update.loc = i; + } + }, + reducer_type(value_loc)); + + ASSERT_EQ(value_loc.val, h_values(0)); + ASSERT_GE(value_loc.loc, 0); + ASSERT_LT(value_loc.loc, N); + } + static void test_BAnd(int N) { Kokkos::View<Scalar*, ExecSpace> values("Values", N); auto h_values = Kokkos::create_mirror_view(values); @@ -1191,6 +1566,7 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); + test_minloc_loc_init(3); // FIXME_OPENMPTARGET requires custom reductions. #if !defined(KOKKOS_ENABLE_OPENMPTARGET) test_minloc_2d(100); @@ -1200,6 +1576,7 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. 
test_maxloc(10007); + test_maxloc_loc_init(3); // FIXME_OPENMPTARGET requires custom reductions. #if !defined(KOKKOS_ENABLE_OPENMPTARGET) test_maxloc_2d(100); @@ -1219,7 +1596,12 @@ struct TestReducers { #endif #else test_minmaxloc(10007); + test_minmaxloc_loc_init(3); test_minmaxloc_2d(100); + + test_minmaxfirstlastloc_loc_init(3); + test_minfirstloc_loc_init(3); + test_maxfirstloc_loc_init(3); #endif #endif } @@ -1234,6 +1616,7 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); + test_minloc_loc_init(3); #if defined(KOKKOS_ENABLE_CUDA) if (!std::is_same_v<ExecSpace, Kokkos::Cuda>) #endif @@ -1246,6 +1629,7 @@ struct TestReducers { #if !defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); + test_maxloc_loc_init(3); #if defined(KOKKOS_ENABLE_CUDA) if (!std::is_same_v<ExecSpace, Kokkos::Cuda>) #endif @@ -1267,7 +1651,12 @@ struct TestReducers { #endif #else test_minmaxloc(10007); + test_minmaxloc_loc_init(3); test_minmaxloc_2d(100); + + test_minmaxfirstlastloc_loc_init(3); + test_minfirstloc_loc_init(3); + test_maxfirstloc_loc_init(3); #endif #endif test_BAnd(35); diff --git a/packages/kokkos/core/unit_test/TestReducers_d.hpp b/packages/kokkos/core/unit_test/TestReducers_d.hpp index 19eaa6d70002e60b5eec78c57f0759c469f9494a..ecf851aa10897491cd0942ea64f3fa18b1f06f7c 100644 --- a/packages/kokkos/core/unit_test/TestReducers_d.hpp +++ b/packages/kokkos/core/unit_test/TestReducers_d.hpp @@ -80,6 +80,20 @@ TEST(TEST_CATEGORY, reducers_int8_t) { TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4); } +TEST(TEST_CATEGORY, reducers_int16_t) { + using ThisTestType = int16_t; + + TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1); + TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2); + TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3); + TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4); 
+ + TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(1); + TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(2); + TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(3); + TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4); +} + #if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET) // TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to // implicitly-deleted default constructor of 'conv_type' diff --git a/packages/kokkos/core/unit_test/TestResize.hpp b/packages/kokkos/core/unit_test/TestResize.hpp index 13d7e16d5890ca170683f98471c2b8af41b6f7a4..3102d2b9a16873f4f080b6ca8d1aca08b6d74511 100644 --- a/packages/kokkos/core/unit_test/TestResize.hpp +++ b/packages/kokkos/core/unit_test/TestResize.hpp @@ -358,6 +358,12 @@ void impl_testResize() { } } +struct NoDefaultConstructor { + int value; + KOKKOS_FUNCTION + NoDefaultConstructor(int x) : value(x) {} +}; + template <class DeviceType> void testResize() { { @@ -367,6 +373,13 @@ void testResize() { impl_testResize<DeviceType, WithoutInitializing>(); // without data initialization } + { + using view_type = Kokkos::View<NoDefaultConstructor*, DeviceType>; + view_type view_1d_no_default( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "view_1d_no_default"), + 5); + resize_dispatch(WithoutInitializing{}, view_1d_no_default, 3); + } } } // namespace TestViewResize diff --git a/packages/kokkos/core/unit_test/TestSharedAlloc.hpp b/packages/kokkos/core/unit_test/TestSharedAlloc.hpp index c7b0f380231e571a5c3f6aae864c91a43ace8977..af4be322ca1b6e52a4112c5999216e9e56d6caab 100644 --- a/packages/kokkos/core/unit_test/TestSharedAlloc.hpp +++ b/packages/kokkos/core/unit_test/TestSharedAlloc.hpp @@ -31,7 +31,7 @@ struct SharedAllocDestroy { SharedAllocDestroy() = default; SharedAllocDestroy(int* arg) : count(arg) {} - void destroy_shared_allocation() { Kokkos::atomic_increment(count); } + void destroy_shared_allocation() { Kokkos::atomic_inc(count); } }; template <class MemorySpace, 
class ExecutionSpace> @@ -215,7 +215,7 @@ TEST(TEST_CATEGORY, impl_shared_alloc) { #elif (TEST_CATEGORY_NUMBER == 6) // hip test_shared_alloc<Kokkos::HIPSpace, Kokkos::DefaultHostExecutionSpace>(); #elif (TEST_CATEGORY_NUMBER == 7) // sycl - test_shared_alloc<Kokkos::Experimental::SYCLDeviceUSMSpace, + test_shared_alloc<Kokkos::SYCLDeviceUSMSpace, Kokkos::DefaultHostExecutionSpace>(); #elif (TEST_CATEGORY_NUMBER == 8) // openacc test_shared_alloc<Kokkos::Experimental::OpenACCSpace, diff --git a/packages/kokkos/core/unit_test/TestSharedSpace.cpp b/packages/kokkos/core/unit_test/TestSharedSpace.cpp index 3e59b796137efd9341a450f36be6bea0f56dbcd0..1b1e6d715a19d31642ce33b213eb306a70ecbb9c 100644 --- a/packages/kokkos/core/unit_test/TestSharedSpace.cpp +++ b/packages/kokkos/core/unit_test/TestSharedSpace.cpp @@ -109,9 +109,11 @@ TEST(defaultdevicetype, shared_space) { GTEST_SKIP() << "skipping because specified arch does not support page migration"; #endif -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - GTEST_SKIP() - << "skipping because clock_tic is only defined for sycl+intel gpu"; +#if defined(KOKKOS_ENABLE_SYCL) && \ + (!defined(KOKKOS_ARCH_INTEL_GPU) || \ + !defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE)) + GTEST_SKIP() << "skipping because clock_tic is only defined for sycl+intel " + "gpu and with rdc support"; #endif const unsigned int numRepetitions = 10; diff --git a/packages/kokkos/core/unit_test/TestSpaceAwareAccessor.hpp b/packages/kokkos/core/unit_test/TestSpaceAwareAccessor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2fad17cb8545aeb249d085873ebf96a19ee2ec4c --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSpaceAwareAccessor.hpp @@ -0,0 +1,156 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> +#include <type_traits> + +#include <gtest/gtest.h> +#ifndef KOKKOS_ENABLE_CXX17 +#include <concepts> +#endif + +template <class T> +struct funky_data_handle { + T* val; + + KOKKOS_FUNCTION + operator T*() { return val; } + KOKKOS_FUNCTION + operator const T*() const { return val; } +}; + +template <class ElementType> +struct FunkyAcc { + using element_type = ElementType; + using reference = std::conditional_t<std::is_const_v<element_type>, + element_type, element_type&>; + using data_handle_type = funky_data_handle<element_type>; + using offset_policy = Kokkos::default_accessor<element_type>; + KOKKOS_FUNCTION + reference access(data_handle_type p, size_t i) const { return p.val[i]; } + KOKKOS_FUNCTION + element_type* offset(data_handle_type p, size_t i) const { return p.val + i; } +}; + +template <class T, class ExecutionSpace, + class MemorySpace = typename ExecutionSpace::memory_space> +void test_space_aware_accessor() { + using memory_space_t = MemorySpace; + using value_type = std::remove_const_t<T>; + Kokkos::View<value_type*, ExecutionSpace> v("V", 100); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecutionSpace>(0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = i; }); + + int errors; + using acc_t = Kokkos::Impl::SpaceAwareAccessor<memory_space_t, FunkyAcc<T>>; + acc_t acc{}; + typename acc_t::data_handle_type ptr{v.data()}; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecutionSpace>(0, v.extent(0)), + KOKKOS_LAMBDA(int i, int& error) { + if (acc.access(ptr, i) != ptr[i]) error++; + if (acc.offset(ptr, i) != ptr + i) error++; + static_assert(std::is_same_v<typename acc_t::element_type, 
T>); + if constexpr (std::is_const_v<T>) { + static_assert(std::is_same_v<typename acc_t::reference, T>); + } else { + static_assert(std::is_same_v<typename acc_t::reference, T&>); + } + static_assert(std::is_same_v<typename acc_t::data_handle_type, + funky_data_handle<T>>); + static_assert( + std::is_same_v<typename acc_t::offset_policy, + Kokkos::Impl::SpaceAwareAccessor< + memory_space_t, Kokkos::default_accessor<T>>>); + if constexpr (std::is_const_v<T>) { + static_assert(std::is_same_v<decltype(acc.access(ptr, i)), + std::remove_const_t<T>>); + } else { + static_assert(std::is_same_v<decltype(acc.access(ptr, i)), T&>); + } + static_assert(std::is_same_v<decltype(acc.offset(ptr, i)), T*>); + static_assert(std::is_same_v<decltype(acc.nested_accessor()), + const FunkyAcc<T>&>); + static_assert(std::is_nothrow_move_constructible_v<acc_t>); + static_assert(std::is_nothrow_move_assignable_v<acc_t>); + static_assert(std::is_nothrow_swappable_v<acc_t>); + static_assert( + std::is_same_v<typename acc_t::memory_space, memory_space_t>); + static_assert( + std::is_same_v<typename acc_t::nested_accessor_type, FunkyAcc<T>>); +#ifndef KOKKOS_ENABLE_CXX17 + static_assert(std::copyable<acc_t>); + static_assert(std::is_empty_v<acc_t>); +#endif + }, + errors); + ASSERT_EQ(errors, 0); +} + +void test_space_aware_accessor_conversion() { + using ExecutionSpace = TEST_EXECSPACE; + using memory_space_t = typename ExecutionSpace::memory_space; + using T = float; + using acc_t = Kokkos::Impl::SpaceAwareAccessor<memory_space_t, + Kokkos::default_accessor<T>>; + using const_acc_t = + Kokkos::Impl::SpaceAwareAccessor<memory_space_t, + Kokkos::default_accessor<const T>>; + using int_acc_t = + Kokkos::Impl::SpaceAwareAccessor<memory_space_t, + Kokkos::default_accessor<int>>; + using host_acc_t = + Kokkos::Impl::SpaceAwareAccessor<Kokkos::HostSpace, + Kokkos::default_accessor<T>>; + using anon_acc_t = + Kokkos::Impl::SpaceAwareAccessor<Kokkos::AnonymousSpace, + Kokkos::default_accessor<T>>; 
+ + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecutionSpace>(0, 1), KOKKOS_LAMBDA(int) { + static_assert(std::is_constructible_v<const_acc_t, acc_t>); + static_assert(std::is_convertible_v<acc_t, const_acc_t>); + static_assert(!std::is_constructible_v<acc_t, const_acc_t>); + static_assert(!std::is_constructible_v<acc_t, int_acc_t>); + static_assert( + std::is_constructible_v<acc_t, host_acc_t> == + Kokkos::Impl::MemorySpaceAccess<memory_space_t, + Kokkos::HostSpace>::assignable); + static_assert( + std::is_constructible_v<host_acc_t, acc_t> == + Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + memory_space_t>::assignable); + static_assert(std::is_constructible_v<anon_acc_t, acc_t>); + static_assert(std::is_constructible_v<acc_t, anon_acc_t>); + static_assert(std::is_convertible_v<anon_acc_t, acc_t>); + static_assert(std::is_convertible_v<acc_t, anon_acc_t>); + }); +} + +TEST(TEST_CATEGORY, mdspan_space_aware_accessor) { + using ExecutionSpace = TEST_EXECSPACE; + test_space_aware_accessor<int, ExecutionSpace>(); + test_space_aware_accessor<double, ExecutionSpace>(); + test_space_aware_accessor<const int, ExecutionSpace>(); + test_space_aware_accessor<const double, ExecutionSpace>(); + test_space_aware_accessor<double, ExecutionSpace, Kokkos::AnonymousSpace>(); + test_space_aware_accessor<const int, ExecutionSpace, + Kokkos::AnonymousSpace>(); + test_space_aware_accessor_conversion(); +} diff --git a/packages/kokkos/core/unit_test/TestSpaceAwareAccessorAccessViolation.hpp b/packages/kokkos/core/unit_test/TestSpaceAwareAccessorAccessViolation.hpp new file mode 100644 index 0000000000000000000000000000000000000000..dcafda6cedfec4c1987f82c44e31c1dfb8d1c439 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSpaceAwareAccessorAccessViolation.hpp @@ -0,0 +1,128 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +#include <gtest/gtest.h> + +template <class MemorySpace, class ExecutionSpace> +struct TestMemoryAccessViolation { + Kokkos::Impl::SpaceAwareAccessor<MemorySpace, Kokkos::default_accessor<int>> + acc; + + KOKKOS_FUNCTION decltype(auto) bad_access() const { + return acc.access(nullptr, 0); + } + + KOKKOS_FUNCTION void operator()(int) const { ++bad_access(); } + + TestMemoryAccessViolation(ExecutionSpace const& s, + std::string const& matcher) { + constexpr bool accessible_from_execution_space = Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/MemorySpace>::accessible; + EXPECT_FALSE(accessible_from_execution_space); + EXPECT_DEATH( + { + Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(s, 0, 1), + *this); + Kokkos::fence(); + }, + matcher); + } +}; + +template <class MemorySpace, class ExecutionSpace> +void test_memory_access_violation(ExecutionSpace const& s, + std::string const& m) { + TestMemoryAccessViolation<MemorySpace, ExecutionSpace>(s, m); +} + +template <class ExecutionSpace> +void test_memory_access_violations_from_host() { + using memory_space_t = typename ExecutionSpace::memory_space; + using exec_space_t = Kokkos::DefaultHostExecutionSpace; + const exec_space_t exec_space{}; + std::string const message = + "Kokkos::SpaceAwareAccessor ERROR: attempt to access inaccessible memory " + "space"; + test_memory_access_violation<memory_space_t, exec_space_t>(exec_space, + message); +} + +template <class ExecutionSpace> +void 
test_memory_access_violations_from_device() { + using memory_space_t = Kokkos::HostSpace; + using exec_space_t = ExecutionSpace; + const exec_space_t exec_space{}; + std::string const message = + "Kokkos::SpaceAwareAccessor ERROR: attempt to access inaccessible memory " + "space"; + test_memory_access_violation<memory_space_t, exec_space_t>(exec_space, + message); +} + +// FIXME_SYCL +#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) +TEST(TEST_CATEGORY_DEATH, + mdspan_space_aware_accessor_invalid_access_from_host) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (Kokkos::SpaceAccessibility< + /*AccessSpace=*/Kokkos::HostSpace, + /*MemorySpace=*/typename ExecutionSpace::memory_space>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + + test_memory_access_violations_from_host<ExecutionSpace>(); +} +#endif + +TEST(TEST_CATEGORY_DEATH, + mdspan_space_aware_accessor_invalid_access_from_device) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/Kokkos::HostSpace>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + +#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL + if (std::is_same<ExecutionSpace, Kokkos::SYCL>::value) { + GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " + "is defined"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET + if (std::is_same<ExecutionSpace, Kokkos::Experimental::OpenMPTarget>::value) { + GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not " + "able to abort from the device"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same<ExecutionSpace, Kokkos::Experimental::OpenACC>::value) { + GTEST_SKIP() << "skipping 
because OpenACC backend is currently not " + "able to abort from the device"; + } +#endif + + test_memory_access_violations_from_device<ExecutionSpace>(); +} diff --git a/packages/kokkos/core/unit_test/TestStackTrace.hpp b/packages/kokkos/core/unit_test/TestStackTrace.hpp index 4dbe436e93016a8faf37fb785c3dfdc16a452ad5..3cef861f2f4444d6ef329016cf70557a1771e12e 100644 --- a/packages/kokkos/core/unit_test/TestStackTrace.hpp +++ b/packages/kokkos/core/unit_test/TestStackTrace.hpp @@ -136,11 +136,13 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) { TEST(defaultdevicetype, stacktrace_normal) { test_stacktrace(false); } TEST(defaultdevicetype_DeathTest, stacktrace_terminate) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; ASSERT_DEATH({ test_stacktrace(true); }, "I am the custom std::terminate handler."); } TEST(defaultdevicetype_DeathTest, stacktrace_generic_term) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; ASSERT_DEATH({ test_stacktrace(true, false); }, "Kokkos observes that std::terminate has been called"); } diff --git a/packages/kokkos/core/unit_test/TestSwap.hpp b/packages/kokkos/core/unit_test/TestSwap.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4e98351cf19c733e0a373c80fa86a6b45f7f4b94 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSwap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <type_traits> +#include <utility> + +namespace { + +template <class ExecutionSpace> +struct TestSwap { + KOKKOS_FUNCTION void operator()(int, int& err) const { + { + int a = 1; + int b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int, int)\n"); + ++err; + } + } + { + float a = 1; + float b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(float, float)\n"); + ++err; + } + } + { + int a[3] = {1, 2, 3}; + int b[3] = {4, 5, 6}; + Kokkos::kokkos_swap(a, b); + if (!(a[0] == 4 && a[1] == 5 && a[2] == 6 && b[0] == 1 && b[1] == 2 && + b[2] == 3)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int[3], int[3])\n"); + ++err; + } + } + } + + TestSwap() { + int errors; + Kokkos::parallel_reduce( + "TestSwap", Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this, errors); + EXPECT_EQ(errors, 0); + } +}; + +TEST(TEST_CATEGORY, kokkos_swap) { TestSwap<TEST_EXECSPACE>(); } + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestTaskScheduler.hpp b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp index e9f3a655686e566204342fdb7c18ac70878dfe5e..dcebcc3855acd405a3b00b7688f51de38f1343cd 100644 --- a/packages/kokkos/core/unit_test/TestTaskScheduler.hpp +++ b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp @@ -24,6 +24,11 @@ #include <iostream> #include <cmath> +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + //============================================================================== // <editor-fold desc="TestFib"> {{{1 @@ -190,7 +195,7 @@ struct TestTaskDependence { Kokkos::respawn(this, f); } else if (1 == m_count) { - Kokkos::atomic_increment(&m_accum()); + Kokkos::atomic_inc(&m_accum()); } } @@ -823,50 
+828,12 @@ struct TestMultipleDependence { #undef TEST_SCHEDULER_SUFFIX #endif -#if 0 -#define TEST_SCHEDULER_SUFFIX _fixed_mempool -#define TEST_SCHEDULER \ - Kokkos::SimpleTaskScheduler< \ - TEST_EXECSPACE, \ - Kokkos::Impl::SingleTaskQueue< \ - TEST_EXECSPACE, \ - Kokkos::Impl::default_tasking_memory_space_for_execution_space_t< \ - TEST_EXECSPACE>, \ - Kokkos::Impl::TaskQueueTraitsLockBased, \ - Kokkos::Impl::FixedBlockSizeMemoryPool< \ - Kokkos::Device< \ - TEST_EXECSPACE, \ - Kokkos::Impl:: \ - default_tasking_memory_space_for_execution_space_t< \ - TEST_EXECSPACE>>, \ - 128, 16>>> -#include "TestTaskScheduler_single.hpp" -#undef TEST_SCHEDULER -#undef TEST_SCHEDULER_SUFFIX - -#define TEST_SCHEDULER_SUFFIX _fixed_mempool_multiple -#define TEST_SCHEDULER \ - Kokkos::SimpleTaskScheduler< \ - TEST_EXECSPACE, \ - Kokkos::Impl::MultipleTaskQueue< \ - TEST_EXECSPACE, \ - Kokkos::Impl::default_tasking_memory_space_for_execution_space_t< \ - TEST_EXECSPACE>, \ - Kokkos::Impl::TaskQueueTraitsLockBased, \ - Kokkos::Impl::FixedBlockSizeMemoryPool< \ - Kokkos::Device< \ - TEST_EXECSPACE, \ - Kokkos::Impl:: \ - default_tasking_memory_space_for_execution_space_t< \ - TEST_EXECSPACE>>, \ - 128, 16>>> -#include "TestTaskScheduler_single.hpp" -#undef TEST_SCHEDULER -#undef TEST_SCHEDULER_SUFFIX -#endif - #undef KOKKOS_TEST_WITH_SUFFIX #undef KOKKOS_PP_CAT_IMPL +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + #endif // #if defined( KOKKOS_ENABLE_TASKDAG ) #endif // #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp index 0a40856f937749a5e7cfbf3912d3d9f659280936..c873807cb5df899443bfaedb66bc5e813c2146b3 100644 --- a/packages/kokkos/core/unit_test/TestTeam.hpp +++ b/packages/kokkos/core/unit_test/TestTeam.hpp @@ -279,7 +279,7 @@ class ReduceTeamFunctor { const int thread_size = ind.team_size() * ind.league_size(); const int chunk = 
(nwork + thread_size - 1) / thread_size; - size_type iwork = chunk * thread_rank; + size_type iwork = static_cast<size_type>(chunk) * thread_rank; const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork; for (; iwork < iwork_end; ++iwork) { @@ -290,6 +290,50 @@ class ReduceTeamFunctor { } }; +template <typename ScalarType, class DeviceType, class ScheduleType> +class ArrayReduceTeamFunctor { + public: + using execution_space = DeviceType; + using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>; + using size_type = typename execution_space::size_type; + + using value_type = ScalarType[]; + size_type value_count = 3; + + size_type nwork; + + KOKKOS_INLINE_FUNCTION + ArrayReduceTeamFunctor(const size_type &arg_nwork) : nwork(arg_nwork) {} + + KOKKOS_INLINE_FUNCTION + void init(value_type dst) const { + for (size_type i = 0; i < value_count; ++i) dst[i] = 0; + } + + KOKKOS_INLINE_FUNCTION + void join(value_type dst, const value_type src) const { + for (size_type i = 0; i < value_count; ++i) dst[i] += src[i]; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const typename policy_type::member_type &team, + value_type dst) const { + const int thread_rank = + team.team_rank() + team.team_size() * team.league_rank(); + const int thread_size = team.team_size() * team.league_size(); + const int chunk = (nwork + thread_size - 1) / thread_size; + + size_type iwork = chunk * thread_rank; + const size_type iwork_end = iwork + chunk < nwork ? 
iwork + chunk : nwork; + + for (; iwork < iwork_end; ++iwork) { + dst[0] += 1; + dst[1] += iwork + 1; + dst[2] += nwork - iwork; + } + } +}; + } // namespace Test namespace { @@ -301,42 +345,82 @@ class TestReduceTeam { using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>; using size_type = typename execution_space::size_type; - TestReduceTeam(const size_type &nwork) { run_test(nwork); } - void run_test(const size_type &nwork) { - using functor_type = - Test::ReduceTeamFunctor<ScalarType, execution_space, ScheduleType>; - using value_type = typename functor_type::value_type; - using result_type = - Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; - enum { Count = 3 }; enum { Repeat = 100 }; - value_type result[Repeat]; - const uint64_t nw = nwork; const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); policy_type team_exec(nw, 1); - const unsigned team_size = team_exec.team_size_recommended( - functor_type(nwork), Kokkos::ParallelReduceTag()); - const unsigned league_size = (nwork + team_size - 1) / team_size; + { + using functor_type = + Test::ReduceTeamFunctor<ScalarType, execution_space, ScheduleType>; + using value_type = typename functor_type::value_type; + using result_type = + Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; - team_exec = policy_type(league_size, team_size); + value_type result[Repeat]; - for (unsigned i = 0; i < Repeat; ++i) { - result_type tmp(&result[i]); - Kokkos::parallel_reduce(team_exec, functor_type(nwork), tmp); + const unsigned team_size = team_exec.team_size_recommended( + functor_type(nwork), Kokkos::ParallelReduceTag()); + const unsigned league_size = (nwork + team_size - 1) / team_size; + + team_exec = policy_type(league_size, team_size); + + for (unsigned i = 0; i < Repeat; ++i) { + result_type tmp(&result[i]); + Kokkos::parallel_reduce(team_exec, functor_type(nwork), tmp); + } + + execution_space().fence(); + + for (unsigned i = 0; i < Repeat; ++i) 
{ + for (unsigned j = 0; j < Count; ++j) { + const uint64_t correct = (j == 0) ? nw : nsum; + ASSERT_EQ((ScalarType)correct, result[i].value[j]); + } + } } + } - execution_space().fence(); + void run_array_test(const size_type &nwork) { + enum { Count = 3 }; + enum { Repeat = 100 }; - for (unsigned i = 0; i < Repeat; ++i) { - for (unsigned j = 0; j < Count; ++j) { - const uint64_t correct = 0 == j % 3 ? nw : nsum; - ASSERT_EQ((ScalarType)correct, result[i].value[j]); + const uint64_t nw = nwork; + const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); + + policy_type team_exec(nw, 1); + + { + using functor_type = + Test::ArrayReduceTeamFunctor<ScalarType, execution_space, + ScheduleType>; + using result_type = Kokkos::View<ScalarType *, Kokkos::HostSpace, + Kokkos::MemoryUnmanaged>; + + ScalarType result[Repeat][Count]; + + const unsigned team_size = team_exec.team_size_recommended( + functor_type(nwork), Kokkos::ParallelReduceTag()); + const unsigned league_size = (nwork + team_size - 1) / team_size; + + team_exec = policy_type(league_size, team_size); + + for (unsigned i = 0; i < Repeat; ++i) { + result_type tmp(&result[i][0], Count); + Kokkos::parallel_reduce(team_exec, functor_type(nwork), tmp); + } + + execution_space().fence(); + + for (unsigned i = 0; i < Repeat; ++i) { + for (unsigned j = 0; j < Count; ++j) { + ASSERT_EQ(j ? 
nsum : nw, static_cast<uint64_t>(result[i][j])) + << "failing at repeat " << i << " and index " << j; + } } } } @@ -381,8 +465,9 @@ class ScanTeamFunctor { void operator()(const typename policy_type::member_type ind, value_type &error) const { if (0 == ind.league_rank() && 0 == ind.team_rank()) { - const int64_t thread_count = ind.league_size() * ind.team_size(); - total() = (thread_count * (thread_count + 1)) / 2; + const int64_t thread_count = + static_cast<int64_t>(ind.league_size()) * ind.team_size(); + total() = (thread_count * (thread_count + 1)) / 2; } // Team max: @@ -595,7 +680,6 @@ struct TestSharedTeam { namespace Test { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) template <class MemorySpace, class ExecSpace, class ScheduleType> struct TestLambdaSharedTeam { TestLambdaSharedTeam() { run(); } @@ -617,7 +701,7 @@ struct TestLambdaSharedTeam { std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32 : 1; #else - int team_size = 1; + int team_size = 1; #endif #ifdef KOKKOS_ENABLE_CUDA @@ -676,7 +760,6 @@ struct TestLambdaSharedTeam { ASSERT_EQ(error_count, 0); } }; -#endif } // namespace Test @@ -807,7 +890,7 @@ struct TestScratchTeam { ? 
p_type(64 / team_size, team_size) : p_type(8192 / team_size, team_size); #else - team_exec = p_type(8192 / team_size, team_size); + team_exec = p_type(8192 / team_size, team_size); #endif Kokkos::parallel_reduce( @@ -993,7 +1076,7 @@ struct ClassNoShmemSizeFunction { #ifdef KOKKOS_ENABLE_SYCL int team_size = 4; #else - int team_size = 8; + int team_size = 8; #endif int const concurrency = ExecSpace().concurrency(); if (team_size > concurrency) team_size = concurrency; @@ -1117,7 +1200,6 @@ struct ClassWithShmemSizeFunction { template <class ExecSpace, class ScheduleType> void test_team_mulit_level_scratch_test_lambda() { -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors; Kokkos::View<int, ExecSpace> d_errors("Errors"); errors = d_errors; @@ -1181,7 +1263,6 @@ void test_team_mulit_level_scratch_test_lambda() { }, error); ASSERT_EQ(error, 0); -#endif } } // namespace Test @@ -1193,9 +1274,7 @@ struct TestMultiLevelScratchTeam { TestMultiLevelScratchTeam() { run(); } void run() { -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>(); -#endif Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1; c1.run(); @@ -1440,10 +1519,10 @@ struct TestTeamBroadcast<ExecSpace, ScheduleType, T, } template <class ScalarType> - static inline std::enable_if_t<!std::is_integral<ScalarType>::value, void> + static inline std::enable_if_t<!std::is_integral_v<ScalarType>, void> compare_test(ScalarType A, ScalarType B, double epsilon_factor) { - if (std::is_same<ScalarType, double>::value || - std::is_same<ScalarType, float>::value) { + if (std::is_same_v<ScalarType, double> || + std::is_same_v<ScalarType, float>) { ASSERT_NEAR((double)A, (double)B, epsilon_factor * std::abs(A) * std::numeric_limits<ScalarType>::epsilon()); @@ -1453,7 +1532,7 @@ struct TestTeamBroadcast<ExecSpace, ScheduleType, T, } template <class ScalarType> - static inline 
std::enable_if_t<std::is_integral<ScalarType>::value, void> + static inline std::enable_if_t<std::is_integral_v<ScalarType>, void> compare_test(ScalarType A, ScalarType B, double) { ASSERT_EQ(A, B); } @@ -1663,7 +1742,6 @@ struct TestTeamPolicyHandleByValue { TestTeamPolicyHandleByValue() { test(); } void test() { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) const int M = 1, N = 1; Kokkos::View<scalar **, mem_space> a("a", M, N); Kokkos::View<scalar **, mem_space> b("b", M, N); @@ -1678,7 +1756,6 @@ struct TestTeamPolicyHandleByValue { Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, N), [&](const int j) { a(i, j) += b(i, j); }); }); -#endif } }; @@ -1753,4 +1830,162 @@ struct TestRepeatedTeamReduce { } // namespace Test +namespace Test { + +struct SimpleTestValueType { + using ScalarType = int; + + ScalarType value[2]; +}; + +struct TestTeamReducerFunctor { + using value_type = SimpleTestValueType; + + KOKKOS_INLINE_FUNCTION + void init(value_type &init) const { + init.value[0] = 1; + init.value[1] = 10; + } + + KOKKOS_INLINE_FUNCTION + void join(value_type &dst, value_type const &src) const { + dst.value[0] *= src.value[0]; + dst.value[1] += src.value[1]; + } + + KOKKOS_INLINE_FUNCTION + void final(value_type &dst) const { + dst.value[0] /= -2; + dst.value[1] /= -2; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &update) const { + update.value[0] *= (i + 1); + update.value[1] *= (i + 2); + } +}; + +struct TestTeamReducer { + using reducer = TestTeamReducer; + using value_type = SimpleTestValueType; + + KOKKOS_INLINE_FUNCTION + TestTeamReducer(value_type &val) : local(val) {} + + KOKKOS_INLINE_FUNCTION + void init(value_type &init) const { + init.value[0] = 1; + init.value[1] = 10; + } + + KOKKOS_INLINE_FUNCTION + void join(value_type &dst, value_type const &src) const { + dst.value[0] *= src.value[0]; + dst.value[1] += src.value[1]; + } + + KOKKOS_INLINE_FUNCTION + void final(value_type &dst) const { + dst.value[0] /= -2; + 
dst.value[1] /= -2; + } + + KOKKOS_INLINE_FUNCTION + value_type &reference() const { return local; } + + value_type &local; +}; + +namespace { + +template <typename ExecSpace> +class TestTeamNestedReducerFunctor { + public: + using execution_space = ExecSpace; + using team_policy_type = Kokkos::TeamPolicy<execution_space>; + using member_type = typename team_policy_type::member_type; + using value_type = SimpleTestValueType; + using functor_type = TestTeamReducerFunctor; + using reducer_type = TestTeamReducer; + using index_type = int; + + void run_test_team_thread() { + auto policy = KOKKOS_LAMBDA(member_type const &member, index_type count) { + return Kokkos::TeamThreadRange(member, count); + }; + run_test_team_policies(policy); + }; + + void run_test_thread_vector() { + auto policy = KOKKOS_LAMBDA(member_type const &member, index_type count) { + return Kokkos::ThreadVectorRange(member, count); + }; + run_test_team_policies(policy); + }; + + void run_test_team_vector() { + auto policy = KOKKOS_LAMBDA(member_type const &member, index_type count) { + return Kokkos::TeamVectorRange(member, count); + }; + run_test_team_policies(policy); + }; + + template <typename Policy> + void run_test_team_policies(Policy &policy) { + constexpr index_type league_size = 3; + constexpr index_type test_count = 8; + + Kokkos::View<value_type[league_size], execution_space> + reducer_functor_result("reducer_functor_result"); + Kokkos::View<value_type[league_size], execution_space> reducer_result( + "reducer_result"); + + Kokkos::parallel_for( + team_policy_type(league_size, Kokkos::AUTO), + KOKKOS_LAMBDA(member_type const &team) { + const int league = team.league_rank(); + + // Using a functor as reducer + value_type result1{}; + Kokkos::parallel_reduce(policy(team, test_count), functor_type{}, + result1); + + // Using a reducer + value_type result2{}; + reducer_type reducer(result2); + Kokkos::parallel_reduce( + policy(team, test_count), + [&](const int i, value_type &update) { + 
update.value[0] *= (i + 1); + update.value[1] *= (i + 2); + }, + reducer); + + Kokkos::single(Kokkos::PerTeam(team), [=]() { + reducer_functor_result(league).value[0] = result1.value[0]; + reducer_functor_result(league).value[1] = result1.value[1]; + + reducer_result(league).value[0] = result2.value[0]; + reducer_result(league).value[1] = result2.value[1]; + }); + }); + Kokkos::fence(); + + auto test1 = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace{}, reducer_functor_result); + auto test2 = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace{}, reducer_result); + + for (unsigned i = 0; i < test1.extent(0); ++i) { + EXPECT_EQ(test1(i).value[0], test2(i).value[0]); + EXPECT_EQ(test1(i).value[1], test2(i).value[1]); + } + } +}; + +} // namespace + +} // namespace Test + /*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/core/unit_test/TestTeamBasic.hpp b/packages/kokkos/core/unit_test/TestTeamBasic.hpp index c395bc0837ce8ba4637174c244b07bae55885e13..5284bf66670f4abda77749ab28288c3d6ad9d6a1 100644 --- a/packages/kokkos/core/unit_test/TestTeamBasic.hpp +++ b/packages/kokkos/core/unit_test/TestTeamBasic.hpp @@ -280,7 +280,7 @@ namespace Test { // Test for non-arithmetic type TEST(TEST_CATEGORY, team_broadcast_long_wrapper) { - static_assert(!std::is_arithmetic<long_wrapper>::value, ""); + static_assert(!std::is_arithmetic_v<long_wrapper>); TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, long_wrapper>::test_teambroadcast(0, 1); diff --git a/packages/kokkos/core/unit_test/TestTeamCombinedReducers.hpp b/packages/kokkos/core/unit_test/TestTeamCombinedReducers.hpp index 47c2f666c95cbcfc7d2af18044844034190f4eaf..d681b0df47a9061e9d19fd4295d53b1b2be448fa 100644 --- a/packages/kokkos/core/unit_test/TestTeamCombinedReducers.hpp +++ b/packages/kokkos/core/unit_test/TestTeamCombinedReducers.hpp @@ -19,10 +19,6 @@ namespace { -// Extended lambdas in 
parallel_for and parallel_reduce will not compile if -// KOKKOS_ENABLE_CUDA_LAMBDA is off -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) - struct TeamTeamCombinedReducer { public: void test_team_thread_range_only_scalars(const int n) { @@ -510,6 +506,4 @@ TEST(TEST_CATEGORY, team_vector_range_combined_reducers) { tester.test_team_vector_range_combined_reducers(9); } -#endif - } // namespace diff --git a/packages/kokkos/core/unit_test/TestTeamMDRange.hpp b/packages/kokkos/core/unit_test/TestTeamMDRange.hpp index 6e65cde0cf88da3082bf8f56b24f49f6cf22ed9d..4905b8a530f20348011660db8e133fed148b8766 100644 --- a/packages/kokkos/core/unit_test/TestTeamMDRange.hpp +++ b/packages/kokkos/core/unit_test/TestTeamMDRange.hpp @@ -148,10 +148,6 @@ struct TestTeamMDParallelFor { } }; -// If KOKKOS_ENABLE_CUDA_LAMBDA is off, extended lambdas used in parallel_for -// and parallel_reduce in these tests will not compile correctly -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) - template <typename ExecSpace> struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { using TeamType = typename Kokkos::TeamPolicy<ExecSpace>::member_type; @@ -169,7 +165,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -202,7 +205,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef 
KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -236,7 +246,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -272,7 +289,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -310,7 +334,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -350,7 +381,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + 
ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -420,7 +458,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -457,7 +502,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -496,7 +548,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -536,7 +595,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + 
ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -579,7 +645,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -620,7 +693,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -653,7 +733,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -687,7 +774,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = 
team.league_rank(); @@ -723,7 +817,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -761,7 +862,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -801,7 +909,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -908,13 +1023,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k) = fillFlattenedIndex(i, j, k); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - 
DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange<Kokkos::Rank<2, Direction>, TeamType>( @@ -923,7 +1045,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -952,13 +1080,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange<Kokkos::Rank<3, Direction>, TeamType>( @@ -966,7 +1101,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& threadSum) { threadSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -997,13 +1138,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - 
Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange<Kokkos::Rank<4, Direction>, TeamType>( @@ -1013,7 +1161,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1045,13 +1199,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange<Kokkos::Rank<5, Direction>, TeamType>( @@ -1061,7 +1222,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum 
+= teamSum; }); +#endif }, finalSum); @@ -1100,13 +1267,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange<Kokkos::Rank<6, Direction>, TeamType>( @@ -1116,7 +1290,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1157,13 +1337,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange<Kokkos::Rank<7, Direction>, TeamType>( @@ -1174,7 +1361,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// 
FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1207,20 +1400,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange<Kokkos::Rank<2, Direction>, TeamType>( team, n1, n2); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1228,11 +1427,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k); }, threadSum); - - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1263,20 +1460,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), 
KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange<Kokkos::Rank<3, Direction>, TeamType>( team, n1, n2, n3); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1286,10 +1489,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1321,20 +1523,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange<Kokkos::Rank<4, Direction>, TeamType>( team, n1, n2, n3, n4); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1344,10 +1552,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + 
Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1384,20 +1591,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange<Kokkos::Rank<5, Direction>, TeamType>( team, n1, n2, n3, n4, n5); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1407,10 +1620,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1451,20 +1663,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto 
threadVectorRange = Kokkos::ThreadVectorMDRange<Kokkos::Rank<6, Direction>, TeamType>( team, n1, n2, n3, n4, n5, n6); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1474,10 +1692,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1510,13 +1727,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange<Kokkos::Rank<3, Direction>, TeamType>( @@ -1527,7 +1751,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& vectorSum) { vectorSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1558,13 +1788,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; 
Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange<Kokkos::Rank<4, Direction>, TeamType>( @@ -1577,7 +1814,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1609,13 +1852,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange<Kokkos::Rank<5, Direction>, TeamType>( @@ -1628,7 +1878,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), 
+ [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1665,13 +1921,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange<Kokkos::Rank<6, Direction>, TeamType>( @@ -1684,7 +1947,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1725,13 +1994,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy<ExecSpace>(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange<Kokkos::Rank<7, Direction>, TeamType>( @@ -1745,7 +2021,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum 
+= teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1900,17 +2182,10 @@ TEST(TEST_CATEGORY, TeamThreadMDRangeParallelReduce) { TEST(TEST_CATEGORY, ThreadVectorMDRangeParallelReduce) { // FIXME_SYCL sycl::group_barrier doesn't work correctly for non-Intel GPUs #if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestThreadVectorMDRangeParallelReduce<TEST_EXECSPACE>:: test_parallel_reduce_for_4D_ThreadVectorMDRange<Left>(dims); TestThreadVectorMDRangeParallelReduce<TEST_EXECSPACE>:: @@ -1940,17 +2215,10 @@ TEST(TEST_CATEGORY, ThreadVectorMDRangeParallelReduce) { TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { // FIXME_SYCL sycl::group_barrier doesn't work correctly for non-Intel GPUs #if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) + if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>) GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. 
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestTeamVectorMDRangeParallelReduce<TEST_EXECSPACE>:: test_parallel_reduce_for_4D_TeamVectorMDRange<Left>(dims); TestTeamVectorMDRangeParallelReduce<TEST_EXECSPACE>:: @@ -1977,7 +2245,5 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { test_parallel_reduce_for_8D_TeamVectorMDRange<Right>(smallDims); } -#endif - } // namespace TeamMDRange } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestTeamMDRangePolicyCTAD.cpp b/packages/kokkos/core/unit_test/TestTeamMDRangePolicyCTAD.cpp new file mode 100644 index 0000000000000000000000000000000000000000..db6448895e4c3ae960e336c21935fd56274c5d7b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamMDRangePolicyCTAD.cpp @@ -0,0 +1,199 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +namespace { + +struct TestTeamThreadMDRangeCTAD { + using TeamPolicy = Kokkos::TeamPolicy<Kokkos::DefaultExecutionSpace>; + using TeamHandle = TeamPolicy::member_type; + + KOKKOS_FUNCTION void operator()(TeamHandle const& team_handle) const { + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamThreadMDRange<Kokkos::Rank<2>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamThreadMDRange<Kokkos::Rank<3>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamThreadMDRange<Kokkos::Rank<4>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamThreadMDRange<Kokkos::Rank<5>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamThreadMDRange<Kokkos::Rank<6>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamThreadMDRange<Kokkos::Rank<7>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamThreadMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamThreadMDRange<Kokkos::Rank<8>, TeamHandle>, + decltype(md_range)>); + } + } + + TestTeamThreadMDRangeCTAD() { + Kokkos::parallel_for(TeamPolicy(0, Kokkos::AUTO), *this); + } +}; + +struct TestTeamVectorMDRangeCTAD { + using TeamPolicy = Kokkos::TeamPolicy<Kokkos::DefaultExecutionSpace>; + using TeamHandle = TeamPolicy::member_type; + + KOKKOS_FUNCTION void operator()(TeamHandle const& 
team_handle) const { + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamVectorMDRange<Kokkos::Rank<2>, TeamHandle>, + decltype(md_range)>); + } + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamVectorMDRange<Kokkos::Rank<4>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamVectorMDRange<Kokkos::Rank<5>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamVectorMDRange<Kokkos::Rank<6>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamVectorMDRange<Kokkos::Rank<7>, TeamHandle>, + decltype(md_range)>); + } + + { + Kokkos::TeamVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0, 0); + static_assert( + std::is_same_v<Kokkos::TeamVectorMDRange<Kokkos::Rank<8>, TeamHandle>, + decltype(md_range)>); + } + } + + TestTeamVectorMDRangeCTAD() { + Kokkos::parallel_for(TeamPolicy(0, Kokkos::AUTO), *this); + } +}; + +struct TestThreadVectorMDRangeCTAD { + using TeamPolicy = Kokkos::TeamPolicy<Kokkos::DefaultExecutionSpace>; + using TeamHandle = TeamPolicy::member_type; + + template <class PolicyTypeExpected, class PolicyTypeToCheck> + KOKKOS_FUNCTION static void check_types( + [[maybe_unused]] PolicyTypeToCheck const& team_handle) { + static_assert(std::is_same_v<PolicyTypeExpected, PolicyTypeToCheck>); + } + + KOKKOS_FUNCTION void operator()(TeamHandle const& team_handle) const { + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0); + 
check_types<Kokkos::ThreadVectorMDRange<Kokkos::Rank<2>, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0); + check_types<Kokkos::ThreadVectorMDRange<Kokkos::Rank<3>, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0); + check_types<Kokkos::ThreadVectorMDRange<Kokkos::Rank<4>, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0); + check_types<Kokkos::ThreadVectorMDRange<Kokkos::Rank<5>, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0); + check_types<Kokkos::ThreadVectorMDRange<Kokkos::Rank<6>, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0); + check_types<Kokkos::ThreadVectorMDRange<Kokkos::Rank<7>, TeamHandle>>( + md_range); + } + + { + Kokkos::ThreadVectorMDRange md_range(team_handle, 0, 0, 0, 0, 0, 0, 0, 0); + check_types<Kokkos::ThreadVectorMDRange<Kokkos::Rank<8>, TeamHandle>>( + md_range); + } + } + + TestThreadVectorMDRangeCTAD() { + Kokkos::parallel_for(TeamPolicy(0, Kokkos::AUTO), *this); + } +}; + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestTeamPolicyCTAD.cpp b/packages/kokkos/core/unit_test/TestTeamPolicyCTAD.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07aaeae819ef99d1b2b94561de88a32246ad9337 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamPolicyCTAD.cpp @@ -0,0 +1,135 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +namespace { + +struct TestTeamPolicyCTAD { + template <typename... Ts> + static void maybe_unused(Ts&&...) {} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v<SomeExecutionSpace>); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int i; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for HIP-ROCm-5.2 warning about was declared but never referenced + TestTeamPolicyCTAD() { maybe_unused(des, notEs, ses, i, notEsToDes); } + + // Default construction deduces to TeamPolicy<> + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, decltype(Kokkos::TeamPolicy{})>); + + // Execution space not provided deduces to TeamPolicy<> + + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, decltype(Kokkos::TeamPolicy(i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO, i))>); + 
static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(i, i, Kokkos::AUTO))>); + + // DefaultExecutionSpace deduces to TeamPolicy<> + + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, i, i))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(des, i, i, Kokkos::AUTO))>); + + // Convertible to DefaultExecutionSpace deduces to TeamPolicy<> + + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, i, i))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy( + notEs, i, Kokkos::AUTO, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<>, + decltype(Kokkos::TeamPolicy(notEs, i, i, Kokkos::AUTO))>); + + // SES != DefaultExecutionSpace deduces to TeamPolicy<SES> + + static_assert(std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, i, i))>); + 
static_assert( + std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO, i))>); + static_assert(std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, Kokkos::AUTO, + Kokkos::AUTO))>); + static_assert( + std::is_same_v<Kokkos::TeamPolicy<SomeExecutionSpace>, + decltype(Kokkos::TeamPolicy(ses, i, i, Kokkos::AUTO))>); +}; + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp b/packages/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp index 5b0bfdb1755c3cbae1062ed84b902d4d3743452f..9d89f757086068bb7611aaf2b37d94314a0ae06c 100644 --- a/packages/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp +++ b/packages/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp @@ -20,11 +20,24 @@ namespace { +struct SomeTag {}; + +struct FunctorFor { + KOKKOS_FUNCTION + void operator()( + Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type const&) const {} + + KOKKOS_FUNCTION + void operator()( + SomeTag, Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type const&) const {} +}; + template <typename Policy> void test_run_time_parameters() { int league_size = 131; using ExecutionSpace = typename Policy::execution_space; + using ParallelTag = Kokkos::ParallelForTag; int team_size = 4 < ExecutionSpace().concurrency() ? 
4 : ExecutionSpace().concurrency(); #ifdef KOKKOS_ENABLE_HPX @@ -44,6 +57,8 @@ void test_run_time_parameters() { ASSERT_EQ(p1.team_size(), team_size); ASSERT_GT(p1.chunk_size(), 0); ASSERT_EQ(p1.scratch_size(0), 0u); + ASSERT_GT(p1.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p1.team_size_recommended(FunctorFor(), ParallelTag()), 0); Policy p2 = p1.set_chunk_size(chunk_size); ASSERT_EQ(p1.league_size(), league_size); @@ -112,6 +127,8 @@ void test_run_time_parameters() { Policy p8; // default constructed ASSERT_EQ(p8.league_size(), 0); ASSERT_EQ(p8.scratch_size(0), 0u); + ASSERT_GT(p8.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p8.team_size_recommended(FunctorFor(), ParallelTag()), 0); p8 = p3; // call assignment operator ASSERT_EQ(p3.league_size(), league_size); ASSERT_EQ(p3.team_size(), team_size); @@ -121,11 +138,25 @@ void test_run_time_parameters() { ASSERT_EQ(p8.team_size(), team_size); ASSERT_EQ(p8.chunk_size(), chunk_size); ASSERT_EQ(p8.scratch_size(0), size_t(scratch_size)); + + Policy p9(league_size, Kokkos::AUTO); + ASSERT_EQ(p9.league_size(), league_size); + ASSERT_GT(p9.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p9.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p10(league_size, team_size, Kokkos::AUTO); + ASSERT_EQ(p10.league_size(), league_size); + ASSERT_EQ(p10.team_size(), team_size); + ASSERT_GT(p10.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p10.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p11(league_size, Kokkos::AUTO, Kokkos::AUTO); + ASSERT_EQ(p11.league_size(), league_size); + ASSERT_GT(p11.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p11.team_size_recommended(FunctorFor(), ParallelTag()), 0); } TEST(TEST_CATEGORY, team_policy_runtime_parameters) { - struct SomeTag {}; - using TestExecSpace = TEST_EXECSPACE; using DynamicSchedule = Kokkos::Schedule<Kokkos::Dynamic>; using LongIndex = Kokkos::IndexType<long>; diff --git 
a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp index 8bdd5e8432201964d93ea4bb89a5ee335724b34f..86d0757a74d2898ba46ce57fe46568c88aca2e21 100644 --- a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp +++ b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp @@ -35,14 +35,18 @@ TEST(TEST_CATEGORY, team_long_reduce) { Kokkos::Experimental::OpenMPTarget>::value) #endif { - TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(0); - TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(0); - TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(3); - TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(3); - TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( - 100000); - TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( - 100000); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_test(0); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_test(0); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_test(3); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_test(3); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_test(100000); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_test(100000); } } @@ -52,18 +56,70 @@ TEST(TEST_CATEGORY, team_double_reduce) { Kokkos::Experimental::OpenMPTarget>::value) #endif { - TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( - 0); - TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( - 0); - TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( - 3); - TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( - 3); - TestReduceTeam<double, TEST_EXECSPACE, 
Kokkos::Schedule<Kokkos::Static> >( - 100000); - TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( - 100000); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_test(0); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_test(0); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_test(3); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_test(3); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_test(100000); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_test(100000); + } +} + +TEST(TEST_CATEGORY, team_long_array_reduce) { +// FIXME_MSVC FIXME_32BIT Test is known to fail +#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_32BIT) + GTEST_SKIP() << "Test know to fail for MSVC or 32-bit builds"; +#endif + +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET: Not implemented + if constexpr (!std::is_same<TEST_EXECSPACE, + Kokkos::Experimental::OpenMPTarget>::value) +#endif + { + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_array_test(0); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_array_test(0); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_array_test(3); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_array_test(3); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_array_test(100000); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_array_test(100000); + } +} + +TEST(TEST_CATEGORY, team_double_array_reduce) { +// FIXME_MSVC FIXME_32BIT Test is known to fail +#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_32BIT) + GTEST_SKIP() << "Test know to fail for MSVC or 32-bit builds"; +#endif + +#ifdef 
KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET: Not implemented + if constexpr (!std::is_same<TEST_EXECSPACE, + Kokkos::Experimental::OpenMPTarget>::value) +#endif + { + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_array_test(0); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_array_test(0); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_array_test(3); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_array_test(3); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >{} + .run_array_test(100000); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >{} + .run_array_test(100000); } } @@ -120,5 +176,18 @@ TEST(TEST_CATEGORY, repeated_team_reduce) { TestRepeatedTeamReduce<TEST_EXECSPACE>(); } +TEST(TEST_CATEGORY, nested_team_reduce_functor_as_reducer) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET: Not implemented + if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value) + GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not " + "properly implemented"; +#endif + { + TestTeamNestedReducerFunctor<TEST_EXECSPACE>().run_test_team_thread(); + TestTeamNestedReducerFunctor<TEST_EXECSPACE>().run_test_thread_vector(); + TestTeamNestedReducerFunctor<TEST_EXECSPACE>().run_test_team_vector(); + } +} + } // namespace Test #endif diff --git a/packages/kokkos/core/unit_test/TestTeamScan.hpp b/packages/kokkos/core/unit_test/TestTeamScan.hpp index 833683227eb0d4331a907f2476d30892ca514980..847985630556e68f26f4da0b44e3da3325811971 100644 --- a/packages/kokkos/core/unit_test/TestTeamScan.hpp +++ b/packages/kokkos/core/unit_test/TestTeamScan.hpp @@ -15,7 +15,6 @@ //@HEADER #include <Kokkos_Core.hpp> -#include <impl/Kokkos_Stacktrace.hpp> #include <cstdio> #include <cstdint> #include <sstream> @@ -54,14 +53,14 @@ struct TestTeamScan { }); } - auto 
operator()(int32_t _M, int32_t _N) { + auto operator()(int32_t M_, int32_t N_) { std::stringstream ss; - ss << Kokkos::Impl::demangle(typeid(*this).name()); - ss << "(/*M=*/" << _M << ", /*N=*/" << _N << ")"; + ss << Kokkos::Impl::TypeInfo<decltype(*this)>::name(); + ss << "(/*M=*/" << M_ << ", /*N=*/" << N_ << ")"; std::string const test_id = ss.str(); - M = _M; - N = _N; + M = M_; + N = N_; a_d = view_type("a_d", M, N); a_r = view_type("a_r", M, N); @@ -172,14 +171,14 @@ struct TestTeamScanRetVal { Kokkos::single(Kokkos::PerTeam(team), [&]() { a_s(leagueRank) = accum; }); } - auto operator()(int32_t _M, int32_t _N) { + auto operator()(int32_t M_, int32_t N_) { std::stringstream ss; - ss << Kokkos::Impl::demangle(typeid(*this).name()); - ss << "(/*M=*/" << _M << ", /*N=*/" << _N << ")"; + ss << Kokkos::Impl::TypeInfo<decltype(*this)>::name(); + ss << "(/*M=*/" << M_ << ", /*N=*/" << N_ << ")"; std::string const test_id = ss.str(); - M = _M; - N = _N; + M = M_; + N = N_; a_d = view_2d_type("a_d", M, N); a_r = view_2d_type("a_r", M, N); a_s = view_1d_type("a_s", M); diff --git a/packages/kokkos/core/unit_test/TestTeamScratch.hpp b/packages/kokkos/core/unit_test/TestTeamScratch.hpp index c072a87c7b2a4a08466f3e435bfdd229448e533d..728241e2e58987083f54b517207b364e26d426cf 100644 --- a/packages/kokkos/core/unit_test/TestTeamScratch.hpp +++ b/packages/kokkos/core/unit_test/TestTeamScratch.hpp @@ -30,7 +30,6 @@ TEST(TEST_CATEGORY, team_scratch_request) { TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(); } -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) TEST(TEST_CATEGORY, team_lambda_shared_request) { TestLambdaSharedTeam<Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(); @@ -38,7 +37,6 @@ TEST(TEST_CATEGORY, team_lambda_shared_request) { Kokkos::Schedule<Kokkos::Dynamic> >(); } TEST(TEST_CATEGORY, scratch_align) { TestScratchAlignment<TEST_EXECSPACE>(); } -#endif TEST(TEST_CATEGORY, shmem_size) { 
TestShmemSize<TEST_EXECSPACE>(); } diff --git a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp index 39122736ed7e29c9b4809920f0ce1d6d8b6a7d50..ee0ba46e0dfd1528fae46fc902453eaf7541103c 100644 --- a/packages/kokkos/core/unit_test/TestTeamVector.hpp +++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp @@ -782,7 +782,6 @@ namespace Test { // Computes y^T*A*x // ( modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar ) -#if (!defined(KOKKOS_ENABLE_CUDA)) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template <typename ScalarType, class DeviceType> class TestTripleNestedReduce { public: @@ -882,21 +881,6 @@ class TestTripleNestedReduce { } }; -#else // #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined( - // KOKKOS_ENABLE_CUDA_LAMBDA ) - -template <typename ScalarType, class DeviceType> -class TestTripleNestedReduce { - public: - using execution_space = DeviceType; - using size_type = typename execution_space::size_type; - - TestTripleNestedReduce(const size_type &, const size_type, const size_type &, - const size_type) {} -}; - -#endif - namespace VectorScanReducer { enum class ScanType : bool { Inclusive, Exclusive }; @@ -980,7 +964,7 @@ struct checkScan { const std::string label = (scan_type == ScanType::Inclusive ? 
std::string("inclusive") : std::string("exclusive")) + - "Scan" + typeid(Reducer).name(); + "Scan" + std::string(Kokkos::Impl::TypeInfo<Reducer>::name()); Kokkos::parallel_for(label, policy, *this); Kokkos::fence(); @@ -1012,7 +996,6 @@ struct checkScan { }; } // namespace VectorScanReducer -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(0))); ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(1))); @@ -1028,9 +1011,7 @@ TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(11))); ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(12))); } -#endif -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, triple_nested_parallelism) { // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80 @@ -1039,14 +1020,14 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) { #if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA) if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) #elif defined(KOKKOS_ENABLE_SYCL) - if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value) + if (!std::is_same<TEST_EXECSPACE, Kokkos::SYCL>::value) #endif { TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 32); TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 16); } #if defined(KOKKOS_ENABLE_SYCL) - if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value) + if (!std::is_same<TEST_EXECSPACE, Kokkos::SYCL>::value) #endif { TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 33); @@ -1055,7 +1036,6 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) { TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 16); TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 7, 16); } -#endif TEST(TEST_CATEGORY, parallel_scan_with_reducers) { using T = double; @@ -1064,11 +1044,8 @@ TEST(TEST_CATEGORY, 
parallel_scan_with_reducers) { constexpr int n = 1000000; constexpr int n_vector_range = 100; -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) { - GTEST_SKIP() << "All but max inclusive scan differ at index 101"; - } +#ifdef KOKKOS_IMPL_32BIT + GTEST_SKIP() << "Failing KOKKOS_IMPL_32BIT"; // FIXME_32BIT #endif checkScan<TEST_EXECSPACE, ScanType::Exclusive, n, n_vector_range, diff --git a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp index 06139eb345d9cdf3ebd100404f033e71b63683bf..d844756bf4acc0cd40c9e471196497874b116365 100644 --- a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp +++ b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp @@ -436,12 +436,6 @@ namespace Test { TEST(TEST_CATEGORY, team_teamvector_range) { ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(0))); -#if defined(KOKKOS_ENABLE_CUDA) && \ - defined(KOKKOS_COMPILER_NVHPC) // FIXME_NVHPC 23.7 - if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) { - GTEST_SKIP() << "Disabling 2/3rd of the test for now"; - } -#endif ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(1))); // FIXME_OPENMPTARGET - Use of kokkos reducers currently results in runtime // memory errors. diff --git a/packages/kokkos/core/unit_test/TestTypeInfo.cpp b/packages/kokkos/core/unit_test/TestTypeInfo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..93c4ae1909277b8ef9d026427c2a2dadce61d8f0 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTypeInfo.cpp @@ -0,0 +1,74 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_TypeInfo.hpp> + +#include <type_traits> + +namespace { + +using Kokkos::Impl::TypeInfo; + +struct Foo {}; +using FooAlias = Foo; +enum Bar { BAR_0, BAR_1, BAR_2 }; +union Baz { + int i; + float f; +}; + +[[maybe_unused]] auto func = [](int) {}; // < line 33 +// ^ column 30 +using Lambda = decltype(func); + +// clang-format off +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) +// can't do much +// it looks like that there is 1st an EDG pass and then a host pass and they cannot both agree on what the type info is +#elif defined(__EDG__) || (defined(__NVCC__) && defined(__CUDA_ARCH__)) +static_assert(TypeInfo<Foo>::name() == "<unnamed>::Foo"); +static_assert(TypeInfo<FooAlias>::name() == "<unnamed>::Foo"); +static_assert(TypeInfo<Bar>::name() == "<unnamed>::Bar"); +static_assert(TypeInfo<Baz>::name() == "<unnamed>::Baz"); +static_assert(TypeInfo<Lambda>::name() == "lambda [](int)->void"); +#elif defined(__clang__) +static_assert(TypeInfo<Foo>::name() == "(anonymous namespace)::Foo"); +static_assert(TypeInfo<FooAlias>::name() == "(anonymous namespace)::Foo"); +static_assert(TypeInfo<Bar>::name() == "(anonymous namespace)::Bar"); +static_assert(TypeInfo<Baz>::name() == "(anonymous namespace)::Baz"); +static_assert(TypeInfo<Lambda>::name() == "(anonymous namespace)::(lambda at " __FILE__ ":33:30)"); +#elif defined(__GNUC__) +static_assert(TypeInfo<Foo>::name() == "{anonymous}::Foo"); +static_assert(TypeInfo<FooAlias>::name() == "{anonymous}::Foo"); +static_assert(TypeInfo<Bar>::name() == "{anonymous}::Bar"); +static_assert(TypeInfo<Baz>::name() == "{anonymous}::Baz"); +static_assert(TypeInfo<Lambda>::name() == "{anonymous}::<lambda(int)>"); +#elif defined(_MSC_VER) +static_assert(TypeInfo<Foo>::name() == "struct `anonymous-namespace'::Foo"); 
+static_assert(TypeInfo<FooAlias>::name() == "struct `anonymous-namespace'::Foo"); +static_assert(TypeInfo<Bar>::name() == "enum `anonymous-namespace'::Bar"); +static_assert(TypeInfo<Baz>::name() == "union `anonymous-namespace'::Baz"); +#ifndef KOKKOS_ENABLE_CXX17 +static_assert(TypeInfo<Lambda>::name().starts_with("class `anonymous-namespace'::<lambda_")); +// underscore followed by some 32-bit hash that seems sensitive to the content of the current source code file +static_assert(TypeInfo<Lambda>::name().ends_with(">")); +#endif +#else +#error how did I ended up here? +#endif +// clang-format on + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestTypeList.cpp b/packages/kokkos/core/unit_test/TestTypeList.cpp index 7057b8c3f70f1a01cb2ae4e0c0d1d804ee9864a2..e0f45d2454a3f496c4d87067798417704fcea9db 100644 --- a/packages/kokkos/core/unit_test/TestTypeList.cpp +++ b/packages/kokkos/core/unit_test/TestTypeList.cpp @@ -25,21 +25,21 @@ using TypeList223NoVoid = Kokkos::Impl::type_list<bool, bool, char, short, int>; // concat_type_list using ConcatTypeList2 = Kokkos::Impl::concat_type_list_t<TypeList2>; -static_assert(std::is_same<TypeList2, ConcatTypeList2>::value, +static_assert(std::is_same_v<TypeList2, ConcatTypeList2>, "concat_type_list of a single type_list failed"); using ConcatTypeList223 = Kokkos::Impl::concat_type_list_t<TypeList2, TypeList2, TypeList3>; -static_assert(std::is_same<TypeList223, ConcatTypeList223>::value, +static_assert(std::is_same_v<TypeList223, ConcatTypeList223>, "concat_type_list of three type_lists failed"); // filter_type_list using FilterTypeList223Void = Kokkos::Impl::filter_type_list_t<std::is_void, TypeList223>; -static_assert(std::is_same<TypeList223Void, FilterTypeList223Void>::value, +static_assert(std::is_same_v<TypeList223Void, FilterTypeList223Void>, "filter_type_list with predicate value==true failed"); using FilterTypeList223NoVoid = Kokkos::Impl::filter_type_list_t<std::is_void, TypeList223, false>; 
-static_assert(std::is_same<TypeList223NoVoid, FilterTypeList223NoVoid>::value, +static_assert(std::is_same_v<TypeList223NoVoid, FilterTypeList223NoVoid>, "filter_type_list with predicate value==false failed"); diff --git a/packages/kokkos/core/unit_test/TestUtilities.hpp b/packages/kokkos/core/unit_test/TestUtilities.hpp index b1f9d30c1fc95a7aefe41ee2919a68e2bf603506..bed2586457a9e7c2da3f40fa298c76f887152b7a 100644 --- a/packages/kokkos/core/unit_test/TestUtilities.hpp +++ b/packages/kokkos/core/unit_test/TestUtilities.hpp @@ -25,20 +25,18 @@ namespace Test { void test_is_specialization_of() { using Kokkos::Impl::is_specialization_of; - static_assert(is_specialization_of<Kokkos::pair<float, int>, Kokkos::pair>{}, - ""); - static_assert(!is_specialization_of<Kokkos::View<int*>, Kokkos::pair>{}, ""); - static_assert(is_specialization_of<Kokkos::View<int*>, Kokkos::View>{}, ""); + static_assert(is_specialization_of<Kokkos::pair<float, int>, Kokkos::pair>{}); + static_assert(!is_specialization_of<Kokkos::View<int*>, Kokkos::pair>{}); + static_assert(is_specialization_of<Kokkos::View<int*>, Kokkos::View>{}); // NOTE Not removing cv-qualifiers - static_assert(!is_specialization_of<Kokkos::View<int*> const, Kokkos::View>{}, - ""); + static_assert( + !is_specialization_of<Kokkos::View<int*> const, Kokkos::View>{}); // NOTE Would not compile because Kokkos::Array takes a non-type template // parameter - // static_assert(is_specialization_of<Kokkos::Array<int, 4>, Kokkos::Array>{}, - // ""); + // static_assert(is_specialization_of<Kokkos::Array<int, 4>, + // Kokkos::Array>{}); // But this is fine of course - static_assert(!is_specialization_of<Kokkos::Array<float, 2>, Kokkos::pair>{}, - ""); + static_assert(!is_specialization_of<Kokkos::Array<float, 2>, Kokkos::pair>{}); } namespace { @@ -94,37 +92,37 @@ void test_is_scoped_enum() { using Kokkos::Impl::is_scoped_enum_v; static_assert(!is_scoped_enum<int>{}); - static_assert(!is_scoped_enum<int>::value); + 
static_assert(!is_scoped_enum<int>::value); // NOLINT static_assert(!is_scoped_enum_v<int>); static_assert( is_public_unambiguous_base_of_v<std::false_type, is_scoped_enum<int>>); static_assert(!is_scoped_enum<Class>{}); - static_assert(!is_scoped_enum<Class>::value); + static_assert(!is_scoped_enum<Class>::value); // NOLINT static_assert(!is_scoped_enum_v<Class>); static_assert( is_public_unambiguous_base_of_v<std::false_type, is_scoped_enum<Class>>); static_assert(!is_scoped_enum<Enum>{}); - static_assert(!is_scoped_enum<Enum>::value); + static_assert(!is_scoped_enum<Enum>::value); // NOLINT static_assert(!is_scoped_enum_v<Enum>); static_assert( is_public_unambiguous_base_of_v<std::false_type, is_scoped_enum<Enum>>); static_assert(!is_scoped_enum<EnumBool>{}); - static_assert(!is_scoped_enum<EnumBool>::value); + static_assert(!is_scoped_enum<EnumBool>::value); // NOLINT static_assert(!is_scoped_enum_v<EnumBool>); static_assert(is_public_unambiguous_base_of_v<std::false_type, is_scoped_enum<EnumBool>>); static_assert(is_scoped_enum<ScopedEnum>{}); - static_assert(is_scoped_enum<ScopedEnum>::value); + static_assert(is_scoped_enum<ScopedEnum>::value); // NOLINT static_assert(is_scoped_enum_v<ScopedEnum>); static_assert(is_public_unambiguous_base_of_v<std::true_type, is_scoped_enum<ScopedEnum>>); static_assert(is_scoped_enum<ScopedEnumShort>{}); - static_assert(is_scoped_enum<ScopedEnumShort>::value); + static_assert(is_scoped_enum<ScopedEnumShort>::value); // NOLINT static_assert(is_scoped_enum_v<ScopedEnumShort>); static_assert( is_public_unambiguous_base_of_v<std::true_type, diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp index ffc500e4a9adb184b1239586c798c9119a9a35f5..55af24afe02ac48014d21ba42fa5d0a10e03c1f2 100644 --- a/packages/kokkos/core/unit_test/TestViewAPI.hpp +++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp @@ -47,7 +47,7 @@ struct TestViewOperator { enum { N = 1000 }; enum { D = 3 }; - 
using view_type = Kokkos::View<T * [D], execution_space>; + using view_type = Kokkos::View<T *[D], execution_space>; const view_type v1; const view_type v2; @@ -741,8 +741,7 @@ struct TestViewMirror { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same<Kokkos::HostSpace, - typename DeviceType::memory_space>::value + std::is_same_v<Kokkos::HostSpace, typename DeviceType::memory_space> ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -768,8 +767,7 @@ struct TestViewMirror { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same<Kokkos::HostSpace, - typename DeviceType::memory_space>::value + std::is_same_v<Kokkos::HostSpace, typename DeviceType::memory_space> ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -837,18 +835,15 @@ struct TestViewMirror { view_const_cast(v)); } - template <class MemoryTraits, class Space> + template <class View> struct CopyUnInit { - using mirror_view_type = typename Kokkos::Impl::MirrorViewType< - Space, double *, Layout, Kokkos::HostSpace, MemoryTraits>::view_type; - - mirror_view_type a_d; + View a_d; KOKKOS_INLINE_FUNCTION - CopyUnInit(mirror_view_type &a_d_) : a_d(a_d_) {} + explicit CopyUnInit(View const &a_d_) : a_d(a_d_) {} KOKKOS_INLINE_FUNCTION - void operator()(const typename Space::size_type i) const { + void operator()(const typename View::size_type i) const { a_d(i) = (double)(10 - i); } }; @@ -866,8 +861,7 @@ struct TestViewMirror { int equal_ptr_h_d = (a_h.data() == a_d.data()) ? 1 : 0; constexpr int is_same_memspace = - std::is_same<Kokkos::HostSpace, - typename DeviceType::memory_space>::value + std::is_same_v<Kokkos::HostSpace, typename DeviceType::memory_space> ? 
1 : 0; @@ -875,7 +869,8 @@ struct TestViewMirror { Kokkos::parallel_for( Kokkos::RangePolicy<typename DeviceType::execution_space>(0, int(10)), - CopyUnInit<MemoryTraits, DeviceType>(a_d)); + // decltype required for Intel classics, that doesn't recognize the CTAD + CopyUnInit<decltype(a_d)>(a_d)); Kokkos::deep_copy(a_h, a_d); @@ -909,10 +904,10 @@ class TestViewAPI { using dView0 = Kokkos::View<T, device>; using dView1 = Kokkos::View<T *, device>; - using dView2 = Kokkos::View<T * [N1], device>; - using dView3 = Kokkos::View<T * [N1][N2], device>; - using dView4 = Kokkos::View<T * [N1][N2][N3], device>; - using const_dView4 = Kokkos::View<const T * [N1][N2][N3], device>; + using dView2 = Kokkos::View<T *[N1], device>; + using dView3 = Kokkos::View<T *[N1][N2], device>; + using dView4 = Kokkos::View<T *[N1][N2][N3], device>; + using const_dView4 = Kokkos::View<const T *[N1][N2][N3], device>; using dView4_unmanaged = Kokkos::View<T ****, device, Kokkos::MemoryUnmanaged>; using host = typename dView0::host_mirror_space; @@ -957,9 +952,11 @@ class TestViewAPI { using view_type = Kokkos::View<int, host>; using mirror_type = typename view_type::HostMirror; - static_assert(std::is_same<typename view_type::memory_space, - typename mirror_type::memory_space>::value, - ""); + static_assert(std::is_same_v<typename view_type::HostMirror, + typename view_type::host_mirror_type>); + + static_assert(std::is_same_v<typename view_type::memory_space, + typename mirror_type::memory_space>); view_type a("a"); mirror_type am = Kokkos::create_mirror_view(a); @@ -1005,26 +1002,26 @@ class TestViewAPI { hView3 hv_3("dView3::HostMirror", N0); hView4 hv_4("dView4::HostMirror", N0); - dView0 dv_0_1(nullptr, 0); + dView0 dv_0_1(nullptr); dView0 dv_0_2(hv_0.label(), hv_0.layout()); - dView1 dv_1_1(nullptr, 0); + dView1 dv_1_1(nullptr, N0); dView1 dv_1_2(hv_1.label(), hv_1.layout()); - dView2 dv_2_1(nullptr, 0); + dView2 dv_2_1(nullptr, N0); dView2 dv_2_2(hv_2.label(), hv_2.layout()); - dView3 
dv_3_1(nullptr, 0); + dView3 dv_3_1(nullptr, N0); dView3 dv_3_2(hv_3.label(), hv_3.layout()); - dView4 dv_4_1(nullptr, 0); + dView4 dv_4_1(nullptr, N0); dView4 dv_4_2(hv_4.label(), hv_4.layout()); } static void run_test_contruction_from_layout_2() { using dView3_0 = Kokkos::View<T ***, device>; - using dView3_1 = Kokkos::View<T * * [N1], device>; - using dView3_2 = Kokkos::View<T * [N1][N2], device>; + using dView3_1 = Kokkos::View<T **[N2], device>; + using dView3_2 = Kokkos::View<T *[N1][N2], device>; using dView3_3 = Kokkos::View<T[N0][N1][N2], device>; dView3_0 v_0("v_0", N0, N1, N2); @@ -1340,6 +1337,40 @@ class TestViewAPI { ASSERT_EQ(dz.data(), nullptr); } + struct test_refcount_poison_copy_functor { + using view_type = Kokkos::View<double *>; + explicit test_refcount_poison_copy_functor(view_type v) : view(v) {} + + test_refcount_poison_copy_functor( + const test_refcount_poison_copy_functor &other) + : view(other.view) { + throw std::bad_alloc(); + } + + KOKKOS_INLINE_FUNCTION void operator()(int) const {} + + view_type view; + }; + + static void run_test_refcount_exception() { + using view_type = typename test_refcount_poison_copy_functor::view_type; + view_type original("original", N0); + ASSERT_EQ(original.use_count(), 1); + + // test_refcount_poison_copy_functor throws during copy construction + try { + Kokkos::parallel_for( + Kokkos::RangePolicy<typename DeviceType::execution_space>(0, N0), + test_refcount_poison_copy_functor(original)); + } catch (const std::bad_alloc &) { + } + + // Ensure refcounting is enabled, we should increment here + auto copy = original; + ASSERT_EQ(original.use_count(), 2); + ASSERT_EQ(copy.use_count(), 2); + } + static void run_test_deep_copy_empty() { // Check Deep Copy of LayoutLeft to LayoutRight { @@ -1360,8 +1391,8 @@ class TestViewAPI { // Check Deep Copy of two empty 2D views { - Kokkos::View<double * [3], Kokkos::LayoutRight> d; - Kokkos::View<double * [3], Kokkos::LayoutRight, Kokkos::HostSpace> h; + 
Kokkos::View<double *[3], Kokkos::LayoutRight> d; + Kokkos::View<double *[3], Kokkos::LayoutRight, Kokkos::HostSpace> h; Kokkos::deep_copy(d, h); Kokkos::deep_copy(h, d); } @@ -1540,53 +1571,6 @@ class TestViewAPI { typename multivector_type::const_type cmvX(cmv); typename const_multivector_type::const_type ccmvX(cmv); } - - static void run_test_error() { -#ifdef KOKKOS_ENABLE_OPENMPTARGET - if (std::is_same<typename dView1::memory_space, - Kokkos::Experimental::OpenMPTargetSpace>::value) - return; -#endif -// FIXME_MSVC_WITH_CUDA -// This test doesn't behave as expected on Windows with CUDA -#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) - if (std::is_same<typename dView1::memory_space, - Kokkos::CudaUVMSpace>::value) - return; -#endif - auto alloc_size = std::numeric_limits<size_t>::max() - 42; - try { - auto should_always_fail = dView1("hello_world_failure", alloc_size); - } catch (std::runtime_error const &error) { - // TODO once we remove the conversion to std::runtime_error, catch the - // appropriate Kokkos error here - std::string msg = error.what(); - ASSERT_PRED_FORMAT2(::testing::IsSubstring, "hello_world_failure", msg); - ASSERT_PRED_FORMAT2(::testing::IsSubstring, - typename device::memory_space{}.name(), msg); - // Can't figure out how to make assertions either/or, so we'll just use - // an if statement here for now. Test failure message will be a bit - // misleading, but developers should figure out what's going on pretty - // quickly. 
- if (msg.find("is not a valid size") != std::string::npos) { - ASSERT_PRED_FORMAT2(::testing::IsSubstring, "is not a valid size", msg); - } else -#ifdef KOKKOS_ENABLE_SYCL - if (msg.find("insufficient memory") != std::string::npos) -#endif - { - ASSERT_PRED_FORMAT2(::testing::IsSubstring, "insufficient memory", msg); - } - // SYCL cannot tell the reason why a memory allocation failed -#ifdef KOKKOS_ENABLE_SYCL - else { - // Otherwise, there has to be some sort of "unknown error" error - ASSERT_PRED_FORMAT2(::testing::IsSubstring, - "because of an unknown error.", msg); - } -#endif - } - } }; } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewAPI_b.hpp b/packages/kokkos/core/unit_test/TestViewAPI_b.hpp index e66ed70ef49b63e659e038759ba7e4a2c77e86d5..b372e822705d840456e62a5e83bd7989ffa22b1e 100644 --- a/packages/kokkos/core/unit_test/TestViewAPI_b.hpp +++ b/packages/kokkos/core/unit_test/TestViewAPI_b.hpp @@ -18,6 +18,36 @@ namespace Test { +TEST(TEST_CATEGORY, view_layout_left_with_stride) { + Kokkos::LayoutLeft ll(10, 20); + ll.stride = 15; + Kokkos::View<int**, Kokkos::LayoutLeft> a("A", ll); + ASSERT_EQ(static_cast<int>(a.extent(0)), 10); + ASSERT_EQ(static_cast<int>(a.extent(1)), 20); + ASSERT_EQ(static_cast<int>(a.stride(0)), 1); + ASSERT_EQ(static_cast<int>(a.stride(1)), 15); + + auto ll2 = a.layout(); + ASSERT_EQ(static_cast<int>(ll2.dimension[0]), 10); + ASSERT_EQ(static_cast<int>(ll2.dimension[1]), 20); + ASSERT_EQ(static_cast<int>(ll2.stride), 15); +} + +TEST(TEST_CATEGORY, view_layout_right_with_stride) { + Kokkos::LayoutRight lr(10, 20); + lr.stride = 25; + Kokkos::View<int**, Kokkos::LayoutRight> a("A", lr); + ASSERT_EQ(static_cast<int>(a.extent(0)), 10); + ASSERT_EQ(static_cast<int>(a.extent(1)), 20); + ASSERT_EQ(static_cast<int>(a.stride(0)), 25); + ASSERT_EQ(static_cast<int>(a.stride(1)), 1); + + auto lr2 = a.layout(); + ASSERT_EQ(static_cast<int>(lr2.dimension[0]), 10); + ASSERT_EQ(static_cast<int>(lr2.dimension[1]), 20); + 
ASSERT_EQ(static_cast<int>(lr2.stride), 25); +} + TEST(TEST_CATEGORY, view_api_b) { TestViewAPI<double, TEST_EXECSPACE>::run_test_view_operator_a(); TestViewAPI<double, TEST_EXECSPACE>::run_test_mirror(); diff --git a/packages/kokkos/core/unit_test/TestViewAPI_c.hpp b/packages/kokkos/core/unit_test/TestViewAPI_c.hpp index 5efbd95bc94e4d287452ec4fb6651ea077d94fd8..042da1e984279493031318c184bbcf51f0ed195e 100644 --- a/packages/kokkos/core/unit_test/TestViewAPI_c.hpp +++ b/packages/kokkos/core/unit_test/TestViewAPI_c.hpp @@ -19,6 +19,7 @@ namespace Test { TEST(TEST_CATEGORY, view_api_c) { + TestViewAPI<double, TEST_EXECSPACE>::run_test_refcount_exception(); TestViewAPI<double, TEST_EXECSPACE>::run_test_deep_copy_empty(); TestViewAPI<double, TEST_EXECSPACE>::run_test_view_operator_b(); } diff --git a/packages/kokkos/core/unit_test/TestViewAPI_d.hpp b/packages/kokkos/core/unit_test/TestViewAPI_d.hpp index 08d21f54499fcd77b4ba02b6c4a098e855b4d8f7..075ac3329c0a095c2f079e7d93576b62d499bd2a 100644 --- a/packages/kokkos/core/unit_test/TestViewAPI_d.hpp +++ b/packages/kokkos/core/unit_test/TestViewAPI_d.hpp @@ -26,11 +26,4 @@ TEST(TEST_CATEGORY, view_api_d) { TestViewAPI<double, TEST_EXECSPACE>::run_test_view_operator_c(); } -TEST(TEST_CATEGORY, view_allocation_error) { -#if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) - GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory"; -#endif - TestViewAPI<double, TEST_EXECSPACE>::run_test_error(); -} - } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp index 2e416d032055bbc760b49d5da4e62a841def9186..e717eeae4081c3e62a962be2fee718033c6a72bb 100644 --- a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp +++ b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp @@ -34,10 +34,9 @@ TEST(TEST_CATEGORY, view_remap) { std::conditional<std::is_same<TEST_EXECSPACE, Kokkos::HIP>::value, \ Kokkos::HIPHostPinnedSpace, 
TEST_EXECSPACE>::type #elif defined(KOKKOS_ENABLE_SYCL) -#define EXECSPACE \ - std::conditional< \ - std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value, \ - Kokkos::Experimental::SYCLHostUSMSpace, TEST_EXECSPACE>::type +#define EXECSPACE \ + std::conditional<std::is_same<TEST_EXECSPACE, Kokkos::SYCL>::value, \ + Kokkos::SYCLHostUSMSpace, TEST_EXECSPACE>::type #elif defined(KOKKOS_ENABLE_OPENMPTARGET) #define EXECSPACE Kokkos::HostSpace #else @@ -45,13 +44,13 @@ TEST(TEST_CATEGORY, view_remap) { #endif using output_type = - Kokkos::View<double * [N1][N2][N3], Kokkos::LayoutRight, EXECSPACE>; + Kokkos::View<double* [N1][N2][N3], Kokkos::LayoutRight, EXECSPACE>; using input_type = - Kokkos::View<int* * [N2][N3], Kokkos::LayoutLeft, EXECSPACE>; + Kokkos::View<int** [N2][N3], Kokkos::LayoutLeft, EXECSPACE>; using diff_type = - Kokkos::View<int * [N0][N2][N3], Kokkos::LayoutLeft, EXECSPACE>; + Kokkos::View<int* [N0][N2][N3], Kokkos::LayoutLeft, EXECSPACE>; output_type output("output", N0); input_type input("input", N0, N1); @@ -152,7 +151,6 @@ inline void test_anonymous_space() { host_anon_assign_view(i) = 142; } Kokkos::View<int**, Kokkos::LayoutRight, ExecSpace> d_view("d_view", 100, 10); -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace, int>(0, 100), KOKKOS_LAMBDA(int i) { int* ptr = &(d_view(i, 0)); @@ -167,7 +165,6 @@ inline void test_anonymous_space() { } }); Kokkos::fence(); -#endif } TEST(TEST_CATEGORY, anonymous_space) { test_anonymous_space(); } @@ -180,8 +177,8 @@ struct TestViewOverloadResolution { static int foo(Kokkos::View<const double***, ExecSpace> /*a*/) { return 3; } // Overload based on compile time dimensions - static int bar(Kokkos::View<double * [3], ExecSpace> /*a*/) { return 4; } - static int bar(Kokkos::View<double * [4], ExecSpace> /*a*/) { return 5; } + static int bar(Kokkos::View<double* [3], ExecSpace> /*a*/) { return 4; } + static int bar(Kokkos::View<double* [4], ExecSpace> /*a*/) 
{ return 5; } static void test_function_overload() { Kokkos::View<double**, typename ExecSpace::execution_space::array_layout, @@ -196,8 +193,8 @@ struct TestViewOverloadResolution { ExecSpace> b("B", 10, 3, 4); int data_type_2 = foo(b); - Kokkos::View<double * [3], - typename ExecSpace::execution_space::array_layout, ExecSpace> + Kokkos::View<double* [3], typename ExecSpace::execution_space::array_layout, + ExecSpace> c(a); int static_extent = bar(c); ASSERT_EQ(1, data_type_1); diff --git a/packages/kokkos/core/unit_test/TestViewBadAlloc.hpp b/packages/kokkos/core/unit_test/TestViewBadAlloc.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1707a9d5d25de246f55f2ba0589748b516d0fa70 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewBadAlloc.hpp @@ -0,0 +1,92 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +#include <gtest/gtest.h> + +namespace { + +template <class MemorySpace> +void test_view_bad_alloc() { + bool did_throw = false; + auto too_large = std::numeric_limits<size_t>::max() - 42; + std::string label = "my_label"; + try { + auto should_always_fail = + Kokkos::View<double *, MemorySpace>(label, too_large); + } catch (std::runtime_error const &error) { + std::string msg = error.what(); + ASSERT_PRED_FORMAT2( + ::testing::IsSubstring, + std::string(MemorySpace::name()) + " memory space failed to allocate", + msg) + << "memory space name is missing"; + ASSERT_PRED_FORMAT2(::testing::IsSubstring, + std::string("(label=\"") + label + "\")", msg) + << "label is missing"; + did_throw = true; + } + ASSERT_TRUE(did_throw); +} + +TEST(TEST_CATEGORY, view_bad_alloc) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = ExecutionSpace::memory_space; +#if defined(__has_feature) +#if __has_feature(address_sanitizer) + if (std::is_same_v<MemorySpace, Kokkos::HostSpace>) { + GTEST_SKIP() << "AddressSanitizer detects allocating too much memory " + "preventing our checks to run"; + } +#endif +#endif +#if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) + if (std::is_same_v<ExecutionSpace, Kokkos::HIP>) { + GTEST_SKIP() + << "ROCm 5.3 segfaults when trying to allocate too much memory"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same_v<ExecutionSpace, Kokkos::Experimental::OpenACC>) { + GTEST_SKIP() << "acc_malloc() not properly returning nullptr"; + } +#endif + +#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) + if (std::is_same_v<ExecutionSpace, Kokkos::Cuda>) { + GTEST_SKIP() << "MSVC/CUDA segfaults when allocating too much memory"; + } +#endif + + test_view_bad_alloc<MemorySpace>(); + + constexpr bool execution_space_is_device = + std::is_same_v<ExecutionSpace, Kokkos::DefaultExecutionSpace> && + 
!std::is_same_v<Kokkos::DefaultExecutionSpace, + Kokkos::DefaultHostExecutionSpace>; + + if constexpr (execution_space_is_device) { +#ifdef KOKKOS_HAS_SHARED_SPACE + test_view_bad_alloc<Kokkos::SharedSpace>(); +#endif +#ifdef KOKKOS_HAS_SHARED_HOST_PINNED_SPACE + test_view_bad_alloc<Kokkos::SharedHostPinnedSpace>(); +#endif + } +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp index 3bfc93aadacf721e5c6c77ffd54208eb3113acaf..a4735b2998874f76e920396c77d51674434341ff 100644 --- a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp +++ b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp @@ -147,6 +147,40 @@ TEST(TEST_CATEGORY, view_copy_tests) { Kokkos::deep_copy(s_a, hs_a); ASSERT_TRUE(run_check(s_a, 6)); } + } else { + // These copies won't succeed, but they should each throw + // an exception whose message contains the view labels, + // and the names of the views' memory spaces. + // + // Note: original a,b both have the same device type, + // and their mirrors have the same device type. 
+ using memory_space = typename decltype(a)::memory_space; + using mirror_memory_space = typename decltype(h_a)::memory_space; + bool threw = false; + std::string msg; + try { + Kokkos::deep_copy(hs_b, s_b); + } catch (std::exception& e) { + threw = true; + msg = e.what(); + } + ASSERT_TRUE(threw); + ASSERT_NE(msg.find(hs_b.label()), std::string::npos); + ASSERT_NE(msg.find(s_b.label()), std::string::npos); + ASSERT_NE(msg.find(memory_space().name()), std::string::npos); + ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos); + threw = false; + try { + Kokkos::deep_copy(s_a, hs_a); + } catch (std::exception& e) { + threw = true; + msg = e.what(); + } + ASSERT_TRUE(threw); + ASSERT_NE(msg.find(s_a.label()), std::string::npos); + ASSERT_NE(msg.find(hs_a.label()), std::string::npos); + ASSERT_NE(msg.find(memory_space().name()), std::string::npos); + ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos); } // Contiguous copies diff --git a/packages/kokkos/core/unit_test/TestViewCopy_b.hpp b/packages/kokkos/core/unit_test/TestViewCopy_b.hpp index 4b155c4c7fc4dcb9b83dc90fcfe1ef4b28dd14b1..4e8bca66dcbb1134fd282606b85c733c458a83d5 100644 --- a/packages/kokkos/core/unit_test/TestViewCopy_b.hpp +++ b/packages/kokkos/core/unit_test/TestViewCopy_b.hpp @@ -206,14 +206,14 @@ TEST(TEST_CATEGORY, view_copy_degenerated) { Kokkos::View<int*, TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_def_1; Kokkos::View<int*, TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Unmanaged>> - v_um_1(reinterpret_cast<int*>(-1), 0); + v_um_1(reinterpret_cast<int*>(-1), 0); Kokkos::View<int*, TEST_EXECSPACE> v_m_def_1; Kokkos::View<int*, TEST_EXECSPACE> v_m_1("v_m_1", 0); Kokkos::View<int*, TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v_um_def_2; Kokkos::View<int*, TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Unmanaged>> - v_um_2(reinterpret_cast<int*>(-1), 0); + v_um_2(reinterpret_cast<int*>(-1), 0); Kokkos::View<int*, TEST_EXECSPACE> v_m_def_2; 
Kokkos::View<int*, TEST_EXECSPACE> v_m_2("v_m_2", 0); diff --git a/packages/kokkos/core/unit_test/TestViewCopy_c.hpp b/packages/kokkos/core/unit_test/TestViewCopy_c.hpp new file mode 100644 index 0000000000000000000000000000000000000000..758af13c7df0da516ef33cd1081d9b332eeeb774 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewCopy_c.hpp @@ -0,0 +1,434 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace { +// Do not rely on deep_copy(0) as we want to test it! +template <class ViewType, class ExecSpace> +void reset_view(const ExecSpace& space, ViewType& a, int magic) { + auto policy = Kokkos::RangePolicy<ExecSpace>(space, 0, a.span()); + + assert(a.span_is_contiguous()); + + Kokkos::parallel_for( + "TestViewCopy::ResetView", policy, + KOKKOS_LAMBDA(int i) { a.data()[i] = magic; }); +} + +template <class ViewType, class ExecSpace> +size_t compute_overall_sum(const ExecSpace& space, ViewType& a) { + auto policy = Kokkos::RangePolicy<ExecSpace>(space, 0, a.span()); + + assert(a.span_is_contiguous()); + + typename ViewType::value_type sum = 0; + Kokkos::parallel_reduce( + "TestViewCopy::ComputeSum", policy, + KOKKOS_LAMBDA(int i, int& lcl_sum) { lcl_sum += a.data()[i]; }, sum); + + return static_cast<size_t>(sum); +} + +template <typename ExecSpace, typename DT, typename... 
DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 0>* = nullptr) { + auto policy = Kokkos::RangePolicy<ExecSpace>(space, 0, 1); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank0", policy, + KOKKOS_LAMBDA(int, bool& local_check) { local_check &= (a() == magic); }, + Kokkos::LAnd<bool>(all_elements_are_set)); + + return all_elements_are_set; +} + +template <typename ExecSpace, typename DT, typename... DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 1>* = nullptr) { + auto policy = Kokkos::RangePolicy<ExecSpace>(space, 0, a.extent(0)); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank1", policy, + KOKKOS_LAMBDA(int i, bool& local_check) { + local_check &= (a(i) == magic); + }, + Kokkos::LAnd<bool>(all_elements_are_set)); + + return all_elements_are_set; +} + +template <typename ExecSpace, typename DT, typename... DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 2>* = nullptr) { + auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<2>, ExecSpace>( + space, {0, 0}, {a.extent(0), a.extent(1)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank2", policy, + KOKKOS_LAMBDA(int i0, int i1, bool& local_check) { + local_check &= (a(i0, i1) == magic); + }, + Kokkos::LAnd<bool>(all_elements_are_set)); + + return all_elements_are_set; +} + +template <typename ExecSpace, typename DT, typename... 
DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 3>* = nullptr) { + auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<3>, ExecSpace>( + space, {0, 0, 0}, {a.extent(0), a.extent(1), a.extent(2)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank3", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, bool& local_check) { + local_check &= (a(i0, i1, i2) == magic); + }, + Kokkos::LAnd<bool>(all_elements_are_set)); + + return all_elements_are_set; +} + +template <typename ExecSpace, typename DT, typename... DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 4>* = nullptr) { + auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<4>, ExecSpace>( + space, {0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank4", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, bool& local_check) { + local_check &= (a(i0, i1, i2, i3) == magic); + }, + Kokkos::LAnd<bool>(all_elements_are_set)); + + return all_elements_are_set; +} + +template <typename ExecSpace, typename DT, typename... 
DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 5>* = nullptr) { + auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<5>, ExecSpace>( + space, {0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank5", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4) == magic); + }, + Kokkos::LAnd<bool>(all_elements_are_set)); + + return all_elements_are_set; +} + +template <typename ExecSpace, typename DT, typename... DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 6>* = nullptr) { + auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<6>, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set; // Uninitialized, set by parallel_reduce + + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank6", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5) == magic); + }, + Kokkos::LAnd<bool>(all_elements_are_set)); + + return all_elements_are_set; +} + +template <typename ExecSpace, typename DT, typename... 
DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 7>* = nullptr) { + auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<6>, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set = true; + + for (size_t outer = 0; outer < a.extent(6); ++outer) { + bool all_local_elements_are_set; // Uninitialized, set by parallel_reduce + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank7", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5, outer) == magic); + }, + Kokkos::LAnd<bool>(all_local_elements_are_set)); + + all_elements_are_set = all_elements_are_set && all_local_elements_are_set; + } + return all_elements_are_set; +} + +template <typename ExecSpace, typename DT, typename... DP> +bool check_magic_value( + const ExecSpace& space, const Kokkos::View<DT, DP...>& a, int magic, + std::enable_if_t<Kokkos::ViewTraits<DT, DP...>::rank == 8>* = nullptr) { + auto policy = Kokkos::MDRangePolicy<Kokkos::Rank<6>, ExecSpace>( + space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3), a.extent(4), + a.extent(5)}); + + bool all_elements_are_set = true; + + for (size_t outer = 0; outer < a.extent(7); ++outer) { + for (size_t inner = 0; inner < a.extent(6); ++inner) { + bool all_local_elements_are_set; // Uninitialized, set by parallel_reduce + Kokkos::parallel_reduce( + "TestViewCopy::CheckMagicValueRank8", policy, + KOKKOS_LAMBDA(int i0, int i1, int i2, int i3, int i4, int i5, + bool& local_check) { + local_check &= (a(i0, i1, i2, i3, i4, i5, inner, outer) == magic); + }, + Kokkos::LAnd<bool>(all_local_elements_are_set)); + + all_elements_are_set = all_elements_are_set && all_local_elements_are_set; + } + } + return all_elements_are_set; +} + +template <class 
ExecSpace, class ViewType> +bool view_fill_test(const ExecSpace& space, ViewType& a, int magic) { + Kokkos::deep_copy(space, a, magic); +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + // FIXME_OPENMPTARGET Does not work with Land reducer + return true; +#else // KOKKOS_ENABLE_OPENMPTARGET + return check_magic_value(space, a, magic); +#endif // KOKKOS_ENABLE_OPENMPTARGET +} + +template <class Layout, class Space> +void run_test() { + int magic = 19; + + using ViewType = Kokkos::View<int********, Layout, Space>; + // Create views with different lengths for each dimension + // We want to test if all loops are over the correct dimensions + // We use prime numbers to make sure that the strides are different + ViewType a_decreasing("a", 23, 19, 17, 13, 11, 7, 5, 3); + // We also test with increasing strides to catch more "out-of-bounds" errors + // within subviews. + ViewType a_increasing("a", 3, 5, 7, 11, 13, 17, 19, 23); + + using exec_space = typename Space::execution_space; + auto space = exec_space(); + + // Use subviews in the tests to have cases with different ranks and + // non-contiguous memory + // Tests have two parts: + // 1. Fill the subview with a magic value and check that all elements are set + // 2. 
Check if only the subview is set by summing all elements in the view and + // comparing to the subview size times the magic value + + // Rank 0 + { + auto sub_dec = Kokkos::subview(a_decreasing, 0, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), + static_cast<size_t>(magic)); + + auto sub_inc = Kokkos::subview(a_increasing, 0, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), + static_cast<size_t>(magic)); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + + // Rank 1 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, 0, 0, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + + // Rank 2 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, 0, 0, + 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, 0, 0, + 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 3 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ( + compute_overall_sum(space, a_decreasing), + 
sub_dec.extent(0) * sub_dec.extent(1) * sub_dec.extent(2) * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, 0, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 4 + { + auto sub_dec = Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), + sub_dec.extent(0) * sub_dec.extent(1) * sub_dec.extent(2) * + sub_dec.extent(3) * magic); + + auto sub_inc = Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 5 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, 0, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 6 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * 
magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 7 + { + auto sub_dec = + Kokkos::subview(a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = + Kokkos::subview(a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, 0); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } + reset_view(space, a_decreasing, 0); + reset_view(space, a_increasing, 0); + space.fence(); + + // Rank 8 + { + auto sub_dec = Kokkos::subview( + a_decreasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, std::make_pair(0, 2)); + EXPECT_TRUE(view_fill_test(space, sub_dec, magic)); + EXPECT_EQ(compute_overall_sum(space, a_decreasing), sub_dec.size() * magic); + + auto sub_inc = Kokkos::subview( + a_increasing, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, + Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, std::make_pair(0, 2)); + EXPECT_TRUE(view_fill_test(space, sub_inc, magic)); + EXPECT_EQ(compute_overall_sum(space, a_increasing), sub_inc.size() * magic); + } +} + +TEST(TEST_CATEGORY, view_fill_tests_layout_right) { + using Space = TEST_EXECSPACE; + using Layout = Kokkos::LayoutRight; + run_test<Layout, Space>(); +} + +TEST(TEST_CATEGORY, view_fill_tests_layout_left) { + using Space = TEST_EXECSPACE; + using Layout = Kokkos::LayoutLeft; + run_test<Layout, Space>(); +} + +} // 
namespace diff --git a/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp b/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp index d71841eef847a4bf792a91176c42f2627dbc305f..de5ba7da7f280c2f14741c87a725b125215b5c14 100644 --- a/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp +++ b/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp @@ -19,33 +19,82 @@ namespace Test { -#define LIVE(EXPR, ARGS, DYNRANK) EXPECT_NO_THROW(EXPR) -#define DIE(EXPR, ARGS, DYNRANK) \ - ASSERT_DEATH( \ - EXPR, \ - "Constructor for Kokkos View 'v_" #ARGS \ - "' has mismatched number of arguments. Number of arguments = " #ARGS \ - " but dynamic rank = " #DYNRANK) +template <int rank, int dynrank, class RankType, std::size_t... Is> +void test_matching_arguments_rank_helper(std::index_sequence<Is...>) { + constexpr int nargs = sizeof...(Is); + using view_type = Kokkos::View<RankType>; + if (nargs == rank || nargs == dynrank) { + { // does not throw + view_type v("v", ((Is * 0) + 1)...); + } + { // does not throw + view_type v(nullptr, ((Is * 0) + 1)...); + } + } else { + ASSERT_DEATH( + { view_type v("v", ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'v' has mismatched number of arguments. " + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + ASSERT_DEATH( + { view_type v(nullptr, ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'UNMANAGED' has mismatched number of " + "arguments. 
" + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + } +} + +template <int rank, int dynrank, template <int> class RankType> +void test_matching_arguments_rank() { + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<0>()); + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<1>()); + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<2>()); + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<3>()); + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<4>()); + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<5>()); + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<6>()); + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<7>()); + test_matching_arguments_rank_helper<rank, dynrank, + typename RankType<rank>::type>( + std::make_index_sequence<8>()); +} -#define PARAM_0 -#define PARAM_1 1 -#define PARAM_2 1, 1 -#define PARAM_3 1, 1, 1 -#define PARAM_4 1, 1, 1, 1 -#define PARAM_5 1, 1, 1, 1, 1 -#define PARAM_6 1, 1, 1, 1, 1, 1 -#define PARAM_7 1, 1, 1, 1, 1, 1, 1 +template <int rank> +struct DynamicRank { + using type = typename DynamicRank<rank - 1>::type*; +}; -#define PARAM_0_RANK 0 -#define PARAM_1_RANK 1 -#define PARAM_2_RANK 2 -#define PARAM_3_RANK 3 -#define PARAM_4_RANK 4 -#define PARAM_5_RANK 5 -#define PARAM_6_RANK 6 -#define PARAM_7_RANK 7 +template <> +struct DynamicRank<0> { + using type = int; +}; -using DType = int; +#ifdef 
KOKKOS_COMPILER_NVHPC +#define VIEW_CTOR_TEST_UNREACHABLE() __builtin_unreachable() +#else +#define VIEW_CTOR_TEST_UNREACHABLE() static_assert(true) +#endif // Skip test execution when KOKKOS_ENABLE_OPENMPTARGET is enabled until // Kokkos::abort() aborts properly on that backend @@ -53,348 +102,126 @@ using DType = int; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_dyn) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType *; - using DType_2 = DType **; - using DType_3 = DType ***; - using DType_4 = DType ****; - using DType_5 = DType *****; - using DType_6 = DType ******; - using DType_7 = DType *******; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View<DType_0> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_0> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_0> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_0> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_0> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_0> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_0> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_0> v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 1 - DIE({ Kokkos::View<DType_1> v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View<DType_1> v_1("v_1", PARAM_1); }, 1, 1); - DIE({ Kokkos::View<DType_1> v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View<DType_1> v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View<DType_1> v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View<DType_1> v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View<DType_1> v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View<DType_1> v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 2, dynamic = 2 - DIE({ Kokkos::View<DType_2> v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View<DType_2> v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ 
Kokkos::View<DType_2> v_2("v_2", PARAM_2); }, 2, 2); - DIE({ Kokkos::View<DType_2> v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View<DType_2> v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View<DType_2> v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View<DType_2> v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View<DType_2> v_7("v_7", PARAM_7); }, 7, 2); - } - - { - // test View parameters for View dim = 3, dynamic = 3 - DIE({ Kokkos::View<DType_3> v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View<DType_3> v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View<DType_3> v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View<DType_3> v_3("v_3", PARAM_3); }, 3, 3); - DIE({ Kokkos::View<DType_3> v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View<DType_3> v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View<DType_3> v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View<DType_3> v_7("v_7", PARAM_7); }, 7, 3); - } - - { - // test View parameters for View dim = 4, dynamic = 4 - DIE({ Kokkos::View<DType_4> v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View<DType_4> v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View<DType_4> v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View<DType_4> v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View<DType_4> v_4("v_4", PARAM_4); }, 4, 4); - DIE({ Kokkos::View<DType_4> v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View<DType_4> v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View<DType_4> v_7("v_7", PARAM_7); }, 7, 4); - } - - { - // test View parameters for View dim = 5, dynamic = 5 - DIE({ Kokkos::View<DType_5> v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View<DType_5> v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View<DType_5> v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View<DType_5> v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View<DType_5> v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View<DType_5> v_5("v_5", PARAM_5); }, 5, 5); - DIE({ Kokkos::View<DType_5> v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View<DType_5> v_7("v_7", 
PARAM_7); }, 7, 5); - } +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + GTEST_SKIP() << "only enforced when debug bound checks is enabled"; + VIEW_CTOR_TEST_UNREACHABLE(); +#endif + + test_matching_arguments_rank<0, 0, DynamicRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 1, DynamicRank>(); // dim = 1, dynamic = 1 + test_matching_arguments_rank<2, 2, DynamicRank>(); // dim = 2, dynamic = 2 + test_matching_arguments_rank<3, 3, DynamicRank>(); // dim = 3, dynamic = 3 + test_matching_arguments_rank<4, 4, DynamicRank>(); // dim = 4, dynamic = 4 + test_matching_arguments_rank<5, 5, DynamicRank>(); // dim = 5, dynamic = 5 + test_matching_arguments_rank<6, 6, DynamicRank>(); // dim = 6, dynamic = 6 + test_matching_arguments_rank<7, 7, DynamicRank>(); // dim = 7, dynamic = 7 + test_matching_arguments_rank<8, 8, DynamicRank>(); // dim = 8, dynamic = 8 +} - { - // test View parameters for View dim = 6, dynamic = 6 - DIE({ Kokkos::View<DType_6> v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View<DType_6> v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View<DType_6> v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View<DType_6> v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View<DType_6> v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View<DType_6> v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View<DType_6> v_6("v_6", PARAM_6); }, 6, 6); - DIE({ Kokkos::View<DType_6> v_7("v_7", PARAM_7); }, 7, 6); - } +template <int rank> +struct StaticRank { + using type = typename StaticRank<rank - 1>::type[1]; +}; - { - // test View parameters for View dim = 7, dynamic = 7 - DIE({ Kokkos::View<DType_7> v_0("v_0" PARAM_0); }, 0, 7); - DIE({ Kokkos::View<DType_7> v_1("v_1", PARAM_1); }, 1, 7); - DIE({ Kokkos::View<DType_7> v_2("v_2", PARAM_2); }, 2, 7); - DIE({ Kokkos::View<DType_7> v_3("v_3", PARAM_3); }, 3, 7); - DIE({ Kokkos::View<DType_7> v_4("v_4", PARAM_4); }, 4, 7); - DIE({ Kokkos::View<DType_7> v_5("v_5", PARAM_5); }, 5, 7); - DIE({ Kokkos::View<DType_7> v_6("v_6", 
PARAM_6); }, 6, 7); - LIVE({ Kokkos::View<DType_7> v_7("v_7", PARAM_7); }, 7, 7); - } -} +template <> +struct StaticRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_stat) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType[1][1]; - using DType_3 = DType[1][1][1]; - using DType_4 = DType[1][1][1][1]; - using DType_5 = DType[1][1][1][1][1]; - using DType_6 = DType[1][1][1][1][1][1]; - using DType_7 = DType[1][1][1][1][1][1][1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View<DType_0> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_0> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_0> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_0> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_0> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_0> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_0> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_0> v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View<DType_1> v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View<DType_1> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_1> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_1> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_1> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_1> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_1> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_1> v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 0 - LIVE({ Kokkos::View<DType_2> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_2> v_1("v_1", PARAM_1); }, 1, 0); - LIVE({ Kokkos::View<DType_2> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_2> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ 
Kokkos::View<DType_2> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_2> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_2> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_2> v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 3, dynamic = 0 - LIVE({ Kokkos::View<DType_3> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_3> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_3> v_2("v_2", PARAM_2); }, 2, 0); - LIVE({ Kokkos::View<DType_3> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_3> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_3> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_3> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_3> v_7("v_7", PARAM_7); }, 7, 0); - } +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + GTEST_SKIP() << "only enforced when debug bound checks is enabled"; + VIEW_CTOR_TEST_UNREACHABLE(); +#endif + + test_matching_arguments_rank<0, 0, StaticRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, StaticRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 0, StaticRank>(); // dim = 2, dynamic = 0 + test_matching_arguments_rank<3, 0, StaticRank>(); // dim = 3, dynamic = 0 + test_matching_arguments_rank<4, 0, StaticRank>(); // dim = 4, dynamic = 0 + test_matching_arguments_rank<5, 0, StaticRank>(); // dim = 5, dynamic = 0 + test_matching_arguments_rank<6, 0, StaticRank>(); // dim = 6, dynamic = 0 + test_matching_arguments_rank<7, 0, StaticRank>(); // dim = 7, dynamic = 0 + test_matching_arguments_rank<8, 0, StaticRank>(); // dim = 8, dynamic = 0 +} - { - // test View parameters for View dim = 4, dynamic = 0 - LIVE({ Kokkos::View<DType_4> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_4> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_4> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_4> v_3("v_3", PARAM_3); }, 3, 0); - LIVE({ Kokkos::View<DType_4> v_4("v_4", PARAM_4); }, 4, 
0); - DIE({ Kokkos::View<DType_4> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_4> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_4> v_7("v_7", PARAM_7); }, 7, 0); - } +template <int rank> +struct MixedRank { + using type = typename DynamicRank<rank - 1>::type[1]; +}; - { - // test View parameters for View dim = 5, dynamic = 0 - LIVE({ Kokkos::View<DType_5> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_5> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_5> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_5> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_5> v_4("v_4", PARAM_4); }, 4, 0); - LIVE({ Kokkos::View<DType_5> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_5> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_5> v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 6, dynamic = 0 - LIVE({ Kokkos::View<DType_6> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_6> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_6> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_6> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_6> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_6> v_5("v_5", PARAM_5); }, 5, 0); - LIVE({ Kokkos::View<DType_6> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_6> v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 7, dynamic = 0 - LIVE({ Kokkos::View<DType_7> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_7> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_7> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_7> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_7> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_7> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_7> v_6("v_6", PARAM_6); }, 6, 0); - LIVE({ Kokkos::View<DType_7> v_7("v_7", PARAM_7); }, 7, 0); - } -} +template <> +struct MixedRank<0> { + 
using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_mix) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType * [1]; - using DType_3 = DType * * [1]; - using DType_4 = DType ** * [1]; - using DType_5 = DType *** * [1]; - using DType_6 = DType **** * [1]; - using DType_7 = DType ***** * [1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View<DType_0> v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View<DType_0> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_0> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_0> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_0> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_0> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_0> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_0> v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View<DType_1> v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View<DType_1> v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View<DType_1> v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View<DType_1> v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View<DType_1> v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View<DType_1> v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View<DType_1> v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View<DType_1> v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 1 - DIE({ Kokkos::View<DType_2> v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View<DType_2> v_1("v_1", PARAM_1); }, 1, 1); - LIVE({ Kokkos::View<DType_2> v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View<DType_2> v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View<DType_2> v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View<DType_2> v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View<DType_2> v_6("v_6", PARAM_6); }, 6, 
1); - DIE({ Kokkos::View<DType_2> v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 3, dynamic = 2 - DIE({ Kokkos::View<DType_3> v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View<DType_3> v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View<DType_3> v_2("v_2", PARAM_2); }, 2, 2); - LIVE({ Kokkos::View<DType_3> v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View<DType_3> v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View<DType_3> v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View<DType_3> v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View<DType_3> v_7("v_7", PARAM_7); }, 7, 2); - } +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + GTEST_SKIP() << "only enforced when debug bound checks is enabled"; + VIEW_CTOR_TEST_UNREACHABLE(); +#endif + + test_matching_arguments_rank<0, 0, MixedRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, MixedRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 1, MixedRank>(); // dim = 2, dynamic = 1 + test_matching_arguments_rank<3, 2, MixedRank>(); // dim = 3, dynamic = 2 + test_matching_arguments_rank<4, 3, MixedRank>(); // dim = 4, dynamic = 3 + test_matching_arguments_rank<5, 4, MixedRank>(); // dim = 5, dynamic = 4 + test_matching_arguments_rank<6, 5, MixedRank>(); // dim = 6, dynamic = 5 + test_matching_arguments_rank<7, 6, MixedRank>(); // dim = 7, dynamic = 6 + test_matching_arguments_rank<8, 7, MixedRank>(); // dim = 8, dynamic = 7 +} - { - // test View parameters for View dim = 4, dynamic = 3 - DIE({ Kokkos::View<DType_4> v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View<DType_4> v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View<DType_4> v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View<DType_4> v_3("v_3", PARAM_3); }, 3, 3); - LIVE({ Kokkos::View<DType_4> v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View<DType_4> v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View<DType_4> v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View<DType_4> v_7("v_7", PARAM_7); }, 
7, 3); - } +#define CHECK_DEATH(EXPR) \ + ASSERT_DEATH(EXPR, \ + "The specified run-time extent for Kokkos::View 'v' does not " \ + "match the compile-time extent in dimension 0. The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 5, dynamic = 4 - DIE({ Kokkos::View<DType_5> v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View<DType_5> v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View<DType_5> v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View<DType_5> v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View<DType_5> v_4("v_4", PARAM_4); }, 4, 4); - LIVE({ Kokkos::View<DType_5> v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View<DType_5> v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View<DType_5> v_7("v_7", PARAM_7); }, 7, 4); - } +#define CHECK_DEATH_UNMANAGED(EXPR) \ + ASSERT_DEATH( \ + EXPR, \ + "The specified run-time extent for Kokkos::View 'UNMANAGED' does not " \ + "match the compile-time extent in dimension 0. The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 6, dynamic = 5 - DIE({ Kokkos::View<DType_6> v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View<DType_6> v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View<DType_6> v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View<DType_6> v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View<DType_6> v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View<DType_6> v_5("v_5", PARAM_5); }, 5, 5); - LIVE({ Kokkos::View<DType_6> v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View<DType_6> v_7("v_7", PARAM_7); }, 7, 5); - } +TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_static_extents) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - { - // test View parameters for View dim = 7, dynamic = 6 - DIE({ Kokkos::View<DType_7> v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View<DType_7> v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View<DType_7> v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View<DType_7> v_3("v_3", PARAM_3); }, 
3, 6); - DIE({ Kokkos::View<DType_7> v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View<DType_7> v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View<DType_7> v_6("v_6", PARAM_6); }, 6, 6); - LIVE({ Kokkos::View<DType_7> v_7("v_7", PARAM_7); }, 7, 6); - } +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + GTEST_SKIP() << "only enforced when debug bound checks is enabled"; + VIEW_CTOR_TEST_UNREACHABLE(); +#endif + + // clang-format off + CHECK_DEATH({ Kokkos::View<int[1]> v("v", 2); }); + CHECK_DEATH({ Kokkos::View<int[1][1]> v("v", 2, 1); }); + CHECK_DEATH({ Kokkos::View<int[1][1][1]> v("v", 2, 1, 1); }); + CHECK_DEATH({ Kokkos::View<int[1][1][1][1]> v("v", 2, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View<int[1][1][1][1][1]> v("v", 2, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View<int[1][1][1][1][1][1]> v("v", 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View<int[1][1][1][1][1][1][1]> v("v", 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View<int[1][1][1][1][1][1][1][1]> v("v", 2, 1, 1, 1, 1, 1, 1, 1); }); + + CHECK_DEATH_UNMANAGED({ Kokkos::View<int[1]> v(nullptr, 2); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View<int[1][1]> v(nullptr, 2, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View<int[1][1][1]> v(nullptr, 2, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View<int[1][1][1][1]> v(nullptr, 2, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View<int[1][1][1][1][1]> v(nullptr, 2, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View<int[1][1][1][1][1][1]> v(nullptr, 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View<int[1][1][1][1][1][1][1]> v(nullptr, 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View<int[1][1][1][1][1][1][1][1]> v(nullptr, 2, 1, 1, 1, 1, 1, 1, 1); }); + // clang-format on } -#endif // KOKKOS_ENABLE_OPENMPTARGET - -#undef PARAM_0 -#undef PARAM_1 -#undef PARAM_2 -#undef PARAM_3 -#undef PARAM_4 -#undef PARAM_5 -#undef PARAM_6 -#undef PARAM_7 -#undef PARAM_0_RANK -#undef PARAM_1_RANK -#undef PARAM_2_RANK -#undef PARAM_3_RANK -#undef 
PARAM_4_RANK -#undef PARAM_5_RANK -#undef PARAM_6_RANK -#undef PARAM_7_RANK +#undef CHECK_DEATH +#undef CHECK_DEATH_UNMANAGED +#endif // KOKKOS_ENABLE_OPENMPTARGET -#undef DType +#undef VIEW_CTOR_TEST_UNREACHABLE -#undef LIVE -#undef DIE } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewCtorProp.hpp b/packages/kokkos/core/unit_test/TestViewCtorProp.hpp new file mode 100644 index 0000000000000000000000000000000000000000..75f887f36c4fdbd43c0984d8e6520f4cde1d70fd --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewCtorProp.hpp @@ -0,0 +1,95 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace { + +using vcp_empty_t = Kokkos::Impl::ViewCtorProp<>; +using vcp_label_base_t = Kokkos::Impl::ViewCtorProp<void, std::string>; +using vcp_label_t = Kokkos::Impl::ViewCtorProp<std::string>; + +// Check traits of Kokkos::Impl::ViewCtorProp<>. +TEST(TEST_CATEGORY, vcp_empty_traits) { + // Check that the empty view constructor properties class is default + // constructible. This is needed for calls of Kokkos::view_alloc(). + static_assert(std::is_default_constructible_v<vcp_empty_t>); + static_assert(std::is_same_v<decltype(Kokkos::view_alloc()), vcp_empty_t>); +} + +// Check Kokkos::Impl::is_view_label. 
+TEST(TEST_CATEGORY, is_view_label) { + static_assert(Kokkos::Impl::is_view_label<std::string>::value); + + static_assert(Kokkos::Impl::is_view_label<const char[3]>::value); + static_assert(Kokkos::Impl::is_view_label<char[3]>::value); + + // A char* is not a label. Thus, a label is distinguished from a pointer type. + static_assert(!Kokkos::Impl::is_view_label<char*>::value); +} + +// Check traits of base class Kokkos::Impl::ViewCtorProp<void, std::string>. +TEST(TEST_CATEGORY, vcp_label_base_traits) { + static_assert(std::is_same_v<typename vcp_label_base_t::type, std::string>); + + // Check that the base class is default constructible. The default constructor + // may be called by the copy constructor of derived classes, such as when + // copy constructing a view constructor properties object from another view + // constructor properties object that holds fewer properties. + static_assert(std::is_default_constructible_v<vcp_label_base_t>); +} + +// Check traits of derived class Kokkos::Impl::ViewCtorProp<std::string>. +TEST(TEST_CATEGORY, vcp_label_traits) { + static_assert(std::is_base_of_v<vcp_label_base_t, vcp_label_t>); + + static_assert(vcp_label_t::has_label); + + // Check that the derived class is not default constructible. It is a design + // choice to not allow the default constructor to be called. + static_assert(!std::is_default_constructible_v<vcp_label_t>); +} + +// Check that Kokkos::view_alloc perfect forwards a label passed by +// rvalue reference, and check that the constructor +// of Kokkos::Impl::ViewCtorProp<std::string> moves this label. +TEST(TEST_CATEGORY, view_alloc_can_perfect_forward_label) { + std::string label("our label"); + + auto prop = Kokkos::view_alloc(std::move(label)); + + ASSERT_TRUE(label.empty()); + ASSERT_EQ(Kokkos::Impl::get_property<Kokkos::Impl::LabelTag>(prop), + "our label"); +} + +// Check the copy constructor of Kokkos::Impl::ViewCtorProp<std::string>. 
+TEST(TEST_CATEGORY, vcp_label_copy_constructor) { + // Copy construction from a view constructor properties object with a label. + static_assert(std::is_copy_constructible_v<vcp_label_t>); + + vcp_label_t prop = Kokkos::view_alloc("our label"); + vcp_label_t prop_copy(prop); + + ASSERT_EQ(Kokkos::Impl::get_property<Kokkos::Impl::LabelTag>(prop), + "our label"); + ASSERT_EQ(Kokkos::Impl::get_property<Kokkos::Impl::LabelTag>(prop_copy), + "our label"); +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp b/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp index 1ab28208f310a2034ad395a5f7b16be0fee67c6c..588d407bd1bf0f92674ba1175b29c4d4c166cc21 100644 --- a/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp +++ b/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp @@ -44,7 +44,7 @@ struct TestViewCtorProp_EmbeddedDim { void operator()(const int i) const { v(i) = i; } }; - static void test_vcpt(const int N0, const int N1) { + static void test_vcpt(const size_t N0, const size_t N1) { // Create views to test { using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; @@ -74,11 +74,11 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same<CommonViewValueType, double>::value), true); - ASSERT_EQ( - (std::is_same<typename decltype(view_alloc_arg)::scalar_array_type, - CommonViewValueType>::value), - true); + ASSERT_EQ((std::is_same_v<CommonViewValueType, double>), true); + ASSERT_EQ((std::is_same_v< + typename decltype(view_alloc_arg)::scalar_array_type, + CommonViewValueType>), + true); #if 0 // debug output for ( int i = 0; i < N0*N1; ++i ) { @@ -87,7 +87,7 @@ struct TestViewCtorProp_EmbeddedDim { printf( " Common value type view: %s \n", typeid( CVT() ).name() ); printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); - if ( std::is_same< CommonViewValueType, double >::value 
== true ) { + if ( std::is_same_v< CommonViewValueType, double > == true ) { printf("Proper common value_type\n"); } else { @@ -115,7 +115,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same<CommonViewValueType, int>::value), true); + ASSERT_EQ((std::is_same_v<CommonViewValueType, int>), true); } } diff --git a/packages/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp b/packages/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b156b72860ec9f0654166e99681f664368499af0 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp @@ -0,0 +1,55 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace { + +template <class T> +void test_empty_view_runtime_unmanaged() { + T d{}; + auto* p = reinterpret_cast<T*>(0xABADBABE); + + (void)Kokkos::View<T>(p); + (void)Kokkos::View<T>(&d); + (void)Kokkos::View<T>(nullptr); + (void)Kokkos::View<T>(NULL); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View<T>(0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View<T*>(p, 0); + (void)Kokkos::View<T*>(&d, 0); + (void)Kokkos::View<T*>(nullptr, 0); + (void)Kokkos::View<T*>(NULL, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View<T*>(0, 0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View<T**>(p, 0, 0); + (void)Kokkos::View<T**>(&d, 0, 0); + (void)Kokkos::View<T**>(nullptr, 0, 0); + (void)Kokkos::View<T**>(NULL, 0, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View<T**>(0, 0, 0); // NOLINT(modernize-use-nullptr) +} + +TEST(TEST_CATEGORY, view_empty_runtime_unmanaged) { + test_empty_view_runtime_unmanaged<float>(); + test_empty_view_runtime_unmanaged<const double>(); + test_empty_view_runtime_unmanaged<int>(); + test_empty_view_runtime_unmanaged<char>(); + test_empty_view_runtime_unmanaged<const char>(); +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp index 0221569b226938434533eb5dd21e54dfa81bbd56..05c501a6e7f909b4feb3db307da818bb38ccd2cd 100644 --- a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp +++ b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp @@ -16,6 +16,8 @@ #include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + namespace Test { namespace Impl { template <class ViewTypeDst, class ViewTypeSrc> @@ -36,8 +38,7 @@ struct TestAssignability { static void try_assign( ViewTypeDst&, ViewTypeSrc&, std::enable_if_t<!MappingType::is_assignable>* = nullptr) { - 
Kokkos::Impl::throw_runtime_exception( - "TestAssignability::try_assign: Unexpected call path"); + FAIL() << "TestAssignability::try_assign: Unexpected call path"; } template <class... Dimensions> @@ -49,20 +50,15 @@ struct TestAssignability { Kokkos::is_always_assignable<ViewTypeDst, ViewTypeSrc>::value; bool is_assignable = Kokkos::is_assignable(dst, src); - // Print out if there is an error with typeid so you can just filter the - // output with c++filt -t to see which assignment causes the error. - if (is_always_assignable != always || is_assignable != sometimes) - printf( - "is_always_assignable: %i (%i), is_assignable: %i (%i) [ %s ] to [ " - "%s ]\n", - is_always_assignable ? 1 : 0, always ? 1 : 0, is_assignable ? 1 : 0, - sometimes ? 1 : 0, typeid(ViewTypeSrc).name(), - typeid(ViewTypeDst).name()); if (sometimes) { - ASSERT_NO_THROW(try_assign<mapping_type>(dst, src)); + try_assign<mapping_type>(dst, src); } - ASSERT_EQ(always, is_always_assignable); - ASSERT_EQ(sometimes, is_assignable); + ASSERT_EQ(always, is_always_assignable) + << Kokkos::Impl::TypeInfo<ViewTypeSrc>::name() << " to " + << Kokkos::Impl::TypeInfo<ViewTypeDst>::name(); + ASSERT_EQ(sometimes, is_assignable) + << Kokkos::Impl::TypeInfo<ViewTypeSrc>::name() << " to " + << Kokkos::Impl::TypeInfo<ViewTypeDst>::name(); } }; @@ -90,18 +86,18 @@ TEST(TEST_CATEGORY, view_is_assignable) { View<int[10], left, d_exec>>::test(false, false); Impl::TestAssignability<View<int**, left, d_exec>, View<int**, left, d_exec>>::test(true, true, 10, 10); - Impl::TestAssignability<View<int * [10], left, d_exec>, + Impl::TestAssignability<View<int* [10], left, d_exec>, View<int**, left, d_exec>>::test(false, true, 10, 10); - Impl::TestAssignability<View<int * [5], left, d_exec>, + Impl::TestAssignability<View<int* [5], left, d_exec>, View<int**, left, d_exec>>::test(false, false, 10, 10); Impl::TestAssignability<View<int**, left, d_exec>, - View<int * [10], left, d_exec>>::test(true, true, 10); - 
Impl::TestAssignability<View<int * [10], left, d_exec>, - View<int * [10], left, d_exec>>::test(true, true, 10); - Impl::TestAssignability<View<int * [5], left, d_exec>, - View<int * [10], left, d_exec>>::test(false, false, - 10); + View<int* [10], left, d_exec>>::test(true, true, 10); + Impl::TestAssignability<View<int* [10], left, d_exec>, + View<int* [10], left, d_exec>>::test(true, true, 10); + Impl::TestAssignability<View<int* [5], left, d_exec>, + View<int* [10], left, d_exec>>::test(false, false, + 10); // Mismatch value_type Impl::TestAssignability<View<int*, left, d_exec>, diff --git a/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp b/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp deleted file mode 100644 index 67308212ee0feb391a1d1bfb0244bc6c6f9d301f..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp +++ /dev/null @@ -1,1756 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include <cstdio> - -#include <gtest/gtest.h> - -#include <Kokkos_Core.hpp> -#include <impl/Kokkos_ViewLayoutTiled.hpp> - -#include <type_traits> -#include <typeinfo> - -namespace Test { - -namespace { - -template <typename ExecSpace> -struct TestViewLayoutTiled { - using Scalar = double; - - static constexpr int T0 = 2; - static constexpr int T1 = 4; - static constexpr int T2 = 4; - static constexpr int T3 = 2; - static constexpr int T4 = 2; - static constexpr int T5 = 2; - static constexpr int T6 = 2; - static constexpr int T7 = 2; - - // Rank 2 - using LayoutLL_2D_2x4 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, - Kokkos::Iterate::Left, T0, T1>; - using LayoutRL_2D_2x4 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, - Kokkos::Iterate::Left, T0, T1>; - using LayoutLR_2D_2x4 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, - Kokkos::Iterate::Right, T0, T1>; - using LayoutRR_2D_2x4 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, - Kokkos::Iterate::Right, T0, T1>; - - // Rank 3 - using LayoutLL_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, - Kokkos::Iterate::Left, T0, T1, T2>; - using LayoutRL_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, - Kokkos::Iterate::Left, T0, T1, T2>; - using LayoutLR_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, - Kokkos::Iterate::Right, T0, T1, T2>; - using LayoutRR_3D_2x4x4 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, - Kokkos::Iterate::Right, T0, T1, T2>; - - // Rank 4 - using LayoutLL_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, - Kokkos::Iterate::Left, T0, T1, T2, T3>; - using LayoutRL_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, - Kokkos::Iterate::Left, T0, T1, T2, T3>; - using 
LayoutLR_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, - Kokkos::Iterate::Right, T0, T1, T2, T3>; - using LayoutRR_4D_2x4x4x2 = - Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, - Kokkos::Iterate::Right, T0, T1, T2, T3>; - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_2d(const int, const int) { -#else - static void test_view_layout_tiled_2d(const int N0, const int N1) { - const int FT = T0 * T1; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - // Test create_mirror_view, deep_copy - // Create LL View - { - using ViewType = - typename Kokkos::View<Scalar**, LayoutLL_2D_2x4, ExecSpace>; - ViewType v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - hv(ti * T0 + i, tj * T1 + j) = - (ti + tj * NT0) * FT + (i + j * T0); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti + tj * NT0) * FT + (i + j * T0) 
+ 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } - - // Create RL View - { - using ViewType = - typename Kokkos::View<Scalar**, LayoutRL_2D_2x4, ExecSpace>; - Kokkos::View<Scalar**, LayoutRL_2D_2x4, ExecSpace> v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - hv(ti * T0 + i, tj * T1 + j) = - (ti * NT1 + tj) * FT + (i + j * T0); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti * NT1 + tj) * FT + (i + j * T0) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = - typename Kokkos::View<Scalar**, LayoutLR_2D_2x4, ExecSpace>; - Kokkos::View<Scalar**, LayoutLR_2D_2x4, ExecSpace> v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int tj = 0; tj < 
NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - hv(ti * T0 + i, tj * T1 + j) = - (ti + tj * NT0) * FT + (i * T1 + j); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 2 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti + tj * NT0) * FT + (i * T1 + j) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = - typename Kokkos::View<Scalar**, LayoutRR_2D_2x4, ExecSpace>; - Kokkos::View<Scalar**, LayoutRR_2D_2x4, ExecSpace> v("v", N0, N1); - - typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); - - // Initialize host-view - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - hv(ti * T0 + i, tj * T1 + j) = - (ti * NT1 + tj) * FT + (i * T1 + j); - } - } - } - } - - // copy to device - Kokkos::deep_copy(v, hv); - - Kokkos::MDRangePolicy< - Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); - - // iterate 
by tile - Kokkos::parallel_for( - "ViewTile rank 2 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int ti, const int tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { - v(ti * T0 + i, tj * T1 + j) += 1; - } - } - } - }); - - Kokkos::deep_copy(hv, v); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(hv, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { - ++counter_subview; - } - if (tile_subview(i, j) != - ((ti * NT1 + tj) * FT + (i * T1 + j) + 1)) { - ++counter_inc; - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_2d - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_3d(const int, const int, const int) { -#else - static void test_view_layout_tiled_3d(const int N0, const int N1, - const int N2) { - const int FT = T0 * T1 * T2; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - - // Create LL View - { - using ViewType = Kokkos::View<Scalar***, LayoutLL_3D_2x4x4, ExecSpace>; - Kokkos::View<Scalar***, LayoutLL_3D_2x4x4, ExecSpace> dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - 
Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RL View - { - using ViewType = Kokkos::View<Scalar***, LayoutRL_3D_2x4x4, ExecSpace>; - Kokkos::View<Scalar***, LayoutRL_3D_2x4x4, ExecSpace> dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) 
+= 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = Kokkos::View<Scalar***, LayoutLR_3D_2x4x4, ExecSpace>; - Kokkos::View<Scalar***, LayoutLR_3D_2x4x4, ExecSpace> dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i 
< T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = Kokkos::View<Scalar***, LayoutRR_3D_2x4x4, ExecSpace>; - Kokkos::View<Scalar***, LayoutRR_3D_2x4x4, ExecSpace> dv("dv", N0, N1, - N2); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 3 RR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k) { - dv(i, j, k) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter_subview; - } - if (tile_subview(i, j, k) != - ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k) + 
1)) { - ++counter_inc; - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_3d - -#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - static void test_view_layout_tiled_4d(const int, const int, const int, - const int){ -#else - static void test_view_layout_tiled_4d(const int N0, const int N1, - const int N2, const int N3) { - const int FT = T0 * T1 * T2 * T3; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - const int NT3 = int(std::ceil(N3 / T3)); - - // Create LL View - { - using ViewType = Kokkos::View<Scalar****, LayoutLL_4D_2x4x4x2, ExecSpace>; - Kokkos::View<Scalar****, LayoutLL_4D_2x4x4x2, ExecSpace> dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 LL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj 
< NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RL View - { - using ViewType = Kokkos::View<Scalar****, LayoutRL_4D_2x4x4x2, ExecSpace>; - Kokkos::View<Scalar****, LayoutRL_4D_2x4x4x2, ExecSpace> dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 RL", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) 
{ - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + - tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create LR View - { - using ViewType = Kokkos::View<Scalar****, LayoutLR_4D_2x4x4x2, ExecSpace>; - Kokkos::View<Scalar****, LayoutLR_4D_2x4x4x2, ExecSpace> dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 LR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - for (int tl = 0; tl < NT3; ++tl) { - 
for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope - - // Create RR View - { - using ViewType = Kokkos::View<Scalar****, LayoutRR_4D_2x4x4x2, ExecSpace>; - Kokkos::View<Scalar****, LayoutRR_4D_2x4x4x2, ExecSpace> dv("dv", N0, N1, - N2, N3); - - typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); - - // Initialize on host - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - // copy to device - Kokkos::deep_copy(dv, v); - - Kokkos::MDRangePolicy< - Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - ExecSpace> - mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); - - // iterate by tile - Kokkos::parallel_for( - "ViewTile rank 4 RR", mdrangepolicy, - KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { - dv(i, j, k, l) += 1; - }); - - Kokkos::deep_copy(v, dv); - - long counter_subview = 0; - long counter_inc = 0; - - 
for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter_subview; - } - if (tile_subview(i, j, k, l) != - ((ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { - ++counter_inc; - } - } - } - } - } - } - } - } - } - ASSERT_EQ(counter_subview, long(0)); - ASSERT_EQ(counter_inc, long(0)); - } // end scope -#endif - } // end test_view_layout_tiled_4d - - static void test_view_layout_tiled_subtile_2d(const int N0, const int N1) { - const int FT = T0 * T1; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - - // Counter to check for errors at the end - long counter[4] = {0}; - - // Create LL View - { - Kokkos::View<Scalar**, LayoutLL_2D_2x4, Kokkos::HostSpace> v("v", N0, N1); - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i + j * T0); - } - } - } - } - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti + tj * NT0) * FT + (i + j * T0) << std::endl; - 
std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View<Scalar**, LayoutRL_2D_2x4, Kokkos::HostSpace> v("v", N0, N1); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i + j * T0); - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti * NT1 + tj) * FT + (i + j * T0) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View<Scalar**, LayoutLR_2D_2x4, Kokkos::HostSpace> v("v", N0, N1); - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i * T1 + j); - } - } - } - } - - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + 
j) - << " flat idx = " - << (ti + tj * NT0) * FT + (i * T1 + j) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; -#endif - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View<Scalar**, LayoutRR_2D_2x4, Kokkos::HostSpace> v("v", N0, N1); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i * T1 + j); - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j - << std::endl; - std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," - << j << " v = " << v(ti * T0 + i, tj * T1 + j) - << " flat idx = " - << (ti * NT1 + tj) * FT + (i * T1 + j) << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j) - << std::endl; - std::cout << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - } // end test_view_layout_tiled_subtile_2d - - static void test_view_layout_tiled_subtile_3d(const int N0, const int N1, - const int N2) { - const int FT = T0 * T1 * T2; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - - // Counter to 
check for errors at the end - long counter[4] = {0}; - // Create LL View - { - Kokkos::View<Scalar***, LayoutLL_3D_2x4x4, Kokkos::HostSpace> v("v", N0, - N1, N2); - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti + tj * NT0 + tk * N0 * N1) * FT + - (i + j * T0 + k * T0 * T1) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View<Scalar***, LayoutRL_3D_2x4x4, Kokkos::HostSpace> v("v", N0, - N1, N2); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1); - } - } - } - } - } - } - - for (int ti = 
0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i + j * T0 + k * T0 * T1) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View<Scalar***, LayoutLR_3D_2x4x4, Kokkos::HostSpace> v("v", N0, - N1, N2); - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << 
" v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti + tj * NT0 + tk * NT0 * NT1) * FT + - (i * T1 * T2 + j * T2 + k) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View<Scalar***, LayoutRR_3D_2x4x4, Kokkos::HostSpace> v("v", N0, - N1, N2); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = - (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k); - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - if (tile_subview(i, j, k) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," - << tj * T1 + j << "," << tk * T2 + k << std::endl; - std::cout - << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk - << "," << i << "," << j << "," << k - << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) - << " flat idx = " - << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + - (i * T1 * T2 + j * T2 + k) - << std::endl; - std::cout << "subview_tile output = " << tile_subview(i, j, k) - << std::endl; - std::cout - << "subview tile rank = " << Kokkos::rank(tile_subview) - << std::endl; -#endif - } - } - } - } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << 
" LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - - } // end test_view_layout_tiled_subtile_3d - - static void test_view_layout_tiled_subtile_4d(const int N0, const int N1, - const int N2, const int N3) { - const int FT = T0 * T1 * T2 * T3; - - const int NT0 = int(std::ceil(N0 / T0)); - const int NT1 = int(std::ceil(N1 / T1)); - const int NT2 = int(std::ceil(N2 / T2)); - const int NT3 = int(std::ceil(N3 / T3)); - - // Counter to check for errors at the end - long counter[4] = {0}; - // Create LL View - { - Kokkos::View<Scalar****, LayoutLL_4D_2x4x4x2, Kokkos::HostSpace> v( - "v", N0, N1, N2, N3); - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[0]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << 
"," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti + tj * NT0 + tk * N0 * N1 + - tl * N0 * N1 * N2) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create RL View - { - Kokkos::View<Scalar****, LayoutRL_4D_2x4x4x2, Kokkos::HostSpace> v( - "v", N0, N1, N2, N3); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * - FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); - } - } - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int l = 0; l < T3; ++l) { - for (int k = 0; k < T2; ++k) { - for (int j = 0; j < T1; ++j) { - for (int i = 0; i < T0; ++i) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[1]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti * NT1 * NT2 * N3 + tj * 
NT2 * N3 + tk * N3 + - tl) * FT + - (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create LR View - { - Kokkos::View<Scalar****, LayoutLR_4D_2x4x4x2, Kokkos::HostSpace> v( - "v", N0, N1, N2, N3); - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - for (int tl = 0; tl < NT3; ++tl) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tj = 0; tj < NT1; ++tj) { - for (int ti = 0; ti < NT0; ++ti) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[2]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti + tj * NT0 + tk * NT0 * NT1 + - tl * NT0 * NT1 * NT2) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) - << std::endl; - std::cout << "subview_tile output = " - << 
tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } - } - } - } - } // end scope - - // Create RR View - { - Kokkos::View<Scalar****, LayoutRR_4D_2x4x4x2, Kokkos::HostSpace> v( - "v", N0, N1, N2, N3); - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = - (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * - FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); - } - } - } - } - } - } - } - } - - for (int ti = 0; ti < NT0; ++ti) { - for (int tj = 0; tj < NT1; ++tj) { - for (int tk = 0; tk < NT2; ++tk) { - for (int tl = 0; tl < NT3; ++tl) { - auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); - for (int i = 0; i < T0; ++i) { - for (int j = 0; j < T1; ++j) { - for (int k = 0; k < T2; ++k) { - for (int l = 0; l < T3; ++l) { - if (tile_subview(i, j, k, l) != - v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l)) { - ++counter[3]; - } -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i - << "," << tj * T1 + j << "," << tk * T2 + k - << "," << tl * T3 + l << std::endl; - std::cout - << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk - << "," << tl << "," - << " i,j,k,l: " << i << "," << j << "," << k << "," - << l << " v = " - << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, - tl * T3 + l) - << " flat idx = " - << (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + - tl) * FT + - (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) - << std::endl; - std::cout << "subview_tile output = " - << tile_subview(i, j, k, l) << std::endl; - std::cout << "subview tile rank = " - << Kokkos::rank(tile_subview) << std::endl; -#endif - } - } - } - } - } 
- } - } - } - } // end scope - -#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT - std::cout << "subview_tile vs view errors:\n" - << " LL: " << counter[0] << " RL: " << counter[1] - << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; -#endif - - ASSERT_EQ(counter[0], long(0)); - ASSERT_EQ(counter[1], long(0)); - ASSERT_EQ(counter[2], long(0)); - ASSERT_EQ(counter[3], long(0)); - - } // end test_view_layout_tiled_subtile_4d - -}; // end TestViewLayoutTiled struct - -} // namespace - -TEST(TEST_CATEGORY, view_layouttiled) { - // These two examples are iterating by tile, then within a tile - not by - // extents If N# is not a power of two, but want to iterate by tile then - // within a tile, need to check that mapped index is within extent - TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_2d(4, 12); - TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_3d(4, 12, 16); - TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_4d(4, 12, 16, 12); -} -TEST(TEST_CATEGORY, view_layouttiled_subtile) { - // These two examples are iterating by tile, then within a tile - not by - // extents If N# is not a power of two, but want to iterate by tile then - // within a tile, need to check that mapped index is within extent - TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_subtile_2d(4, 12); - TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_subtile_3d(4, 12, - 16); - TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_subtile_4d( - 4, 12, 16, 12); -} -} // namespace Test - -#undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp index 9173f0d4316e4d1422268316fce4d76a731de8da..443277f6537218a491f98c8904e491c51b514cfb 100644 --- a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp +++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp @@ -73,67 +73,67 @@ void test_view_mapping() { 
ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); #endif - static_assert(int(dim_0::rank) == int(0), ""); - static_assert(int(dim_0::rank_dynamic) == int(0), ""); - static_assert(int(dim_0::ArgN0) == 1, ""); - static_assert(int(dim_0::ArgN1) == 1, ""); - static_assert(int(dim_0::ArgN2) == 1, ""); - - static_assert(int(dim_s2::rank) == int(1), ""); - static_assert(int(dim_s2::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2::ArgN0) == 2, ""); - static_assert(int(dim_s2::ArgN1) == 1, ""); - - static_assert(int(dim_s2_s3::rank) == int(2), ""); - static_assert(int(dim_s2_s3::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3::ArgN2) == 1, ""); - - static_assert(int(dim_s2_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3_s4::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3_s4::ArgN2) == 4, ""); - static_assert(int(dim_s2_s3_s4::ArgN3) == 1, ""); - - static_assert(int(dim_s0::rank) == int(1), ""); - static_assert(int(dim_s0::rank_dynamic) == int(1), ""); - - static_assert(int(dim_s0_s3::rank) == int(2), ""); - static_assert(int(dim_s0_s3::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3::ArgN1) == 3, ""); - - static_assert(int(dim_s0_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s0_s3_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2), ""); - static_assert(int(dim_s0_s0_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN1) == 
0, ""); - static_assert(int(dim_s0_s0_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s0::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3), ""); - - static_assert(int(dim_s0_s0_s0_s0::rank) == int(4), ""); - static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5), ""); - static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8), ""); + static_assert(int(dim_0::rank) == int(0)); + static_assert(int(dim_0::rank_dynamic) == int(0)); + static_assert(int(dim_0::ArgN0) == 1); + static_assert(int(dim_0::ArgN1) == 1); + static_assert(int(dim_0::ArgN2) == 1); + + static_assert(int(dim_s2::rank) == int(1)); + static_assert(int(dim_s2::rank_dynamic) == int(0)); + static_assert(int(dim_s2::ArgN0) == 2); + static_assert(int(dim_s2::ArgN1) == 1); + + static_assert(int(dim_s2_s3::rank) == int(2)); + static_assert(int(dim_s2_s3::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3::ArgN0) == 2); + static_assert(int(dim_s2_s3::ArgN1) == 3); + static_assert(int(dim_s2_s3::ArgN2) == 1); + + static_assert(int(dim_s2_s3_s4::rank) == int(3)); + static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3_s4::ArgN0) == 2); + static_assert(int(dim_s2_s3_s4::ArgN1) == 3); + static_assert(int(dim_s2_s3_s4::ArgN2) == 4); + static_assert(int(dim_s2_s3_s4::ArgN3) == 1); + + static_assert(int(dim_s0::rank) == int(1)); + static_assert(int(dim_s0::rank_dynamic) == int(1)); + + static_assert(int(dim_s0_s3::rank) == int(2)); + 
static_assert(int(dim_s0_s3::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3::ArgN0) == 0); + static_assert(int(dim_s0_s3::ArgN1) == 3); + + static_assert(int(dim_s0_s3_s4::rank) == int(3)); + static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3_s4::ArgN0) == 0); + static_assert(int(dim_s0_s3_s4::ArgN1) == 3); + static_assert(int(dim_s0_s3_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s4::rank) == int(3)); + static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2)); + static_assert(int(dim_s0_s0_s4::ArgN0) == 0); + static_assert(int(dim_s0_s0_s4::ArgN1) == 0); + static_assert(int(dim_s0_s0_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s0::rank) == int(3)); + static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3)); + + static_assert(int(dim_s0_s0_s0_s0::rank) == int(4)); + static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4)); + + static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5)); + static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8)); dim_s0 d1(2, 3, 4, 5, 6, 7, 8, 9); dim_s0_s0 d2(2, 3, 4, 5, 6, 7, 8, 9); @@ -514,69 +514,61 @@ void test_view_mapping() { { using namespace Kokkos::Impl; - static_assert(rank_dynamic<>::value == 0, ""); - static_assert(rank_dynamic<1>::value == 0, ""); - static_assert(rank_dynamic<0>::value == 1, ""); - static_assert(rank_dynamic<0, 1>::value == 1, ""); - static_assert(rank_dynamic<0, 0, 1>::value == 2, ""); + static_assert(rank_dynamic<>::value == 0); + static_assert(rank_dynamic<1>::value == 0); + static_assert(rank_dynamic<0>::value == 1); + 
static_assert(rank_dynamic<0, 1>::value == 1); + static_assert(rank_dynamic<0, 0, 1>::value == 2); } { using namespace Kokkos::Impl; using a_int_r1 = ViewArrayAnalysis<int[]>; - using a_int_r5 = ViewArrayAnalysis<int* * [4][5][6]>; + using a_int_r5 = ViewArrayAnalysis<int** [4][5][6]>; using a_const_int_r1 = ViewArrayAnalysis<const int[]>; - using a_const_int_r5 = ViewArrayAnalysis<const int* * [4][5][6]>; + using a_const_int_r5 = ViewArrayAnalysis<const int** [4][5][6]>; - static_assert(a_int_r1::dimension::rank == 1, ""); - static_assert(a_int_r1::dimension::rank_dynamic == 1, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_int_r1::dimension::rank == 1); + static_assert(a_int_r1::dimension::rank_dynamic == 1); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 4); + static_assert(a_int_r5::dimension::ArgN3 == 5); + static_assert(a_int_r5::dimension::ArgN4 == 6); + static_assert(a_int_r5::dimension::ArgN5 == 1); static_assert( - std::is_same<typename a_int_r1::dimension, ViewDimension<0> >::value, - ""); - static_assert( - std::is_same<typename a_int_r1::non_const_value_type, int>::value, ""); + std::is_same_v<typename a_int_r1::dimension, ViewDimension<0> >); + static_assert(std::is_same_v<typename a_int_r1::non_const_value_type, int>); - static_assert(a_const_int_r1::dimension::rank == 1, ""); - static_assert(a_const_int_r1::dimension::rank_dynamic == 1, ""); - static_assert(std::is_same<typename a_const_int_r1::dimension, - ViewDimension<0> >::value, - ""); + static_assert(a_const_int_r1::dimension::rank == 1); + static_assert(a_const_int_r1::dimension::rank_dynamic == 1); 
static_assert( - std::is_same<typename a_const_int_r1::non_const_value_type, int>::value, - ""); - - static_assert(a_const_int_r5::dimension::rank == 5, ""); - static_assert(a_const_int_r5::dimension::rank_dynamic == 2, ""); - - static_assert(a_const_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_const_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_const_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_const_int_r5::dimension::ArgN5 == 1, ""); - - static_assert(std::is_same<typename a_const_int_r5::dimension, - ViewDimension<0, 0, 4, 5, 6> >::value, - ""); + std::is_same_v<typename a_const_int_r1::dimension, ViewDimension<0> >); static_assert( - std::is_same<typename a_const_int_r5::non_const_value_type, int>::value, - ""); - - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 2, ""); - static_assert(std::is_same<typename a_int_r5::dimension, - ViewDimension<0, 0, 4, 5, 6> >::value, - ""); + std::is_same_v<typename a_const_int_r1::non_const_value_type, int>); + + static_assert(a_const_int_r5::dimension::rank == 5); + static_assert(a_const_int_r5::dimension::rank_dynamic == 2); + + static_assert(a_const_int_r5::dimension::ArgN0 == 0); + static_assert(a_const_int_r5::dimension::ArgN1 == 0); + static_assert(a_const_int_r5::dimension::ArgN2 == 4); + static_assert(a_const_int_r5::dimension::ArgN3 == 5); + static_assert(a_const_int_r5::dimension::ArgN4 == 6); + static_assert(a_const_int_r5::dimension::ArgN5 == 1); + + static_assert(std::is_same_v<typename a_const_int_r5::dimension, + ViewDimension<0, 0, 4, 5, 6> >); static_assert( - std::is_same<typename a_int_r5::non_const_value_type, int>::value, ""); + std::is_same_v<typename a_const_int_r5::non_const_value_type, int>); + + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 2); + 
static_assert(std::is_same_v<typename a_int_r5::dimension, + ViewDimension<0, 0, 4, 5, 6> >); + static_assert(std::is_same_v<typename a_int_r5::non_const_value_type, int>); } { @@ -585,17 +577,16 @@ void test_view_mapping() { using t_i4 = int[4]; // Dimensions of t_i4 are appended to the multdimensional array. - using a_int_r5 = ViewArrayAnalysis<t_i4** * [3]>; - - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 3, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 0, ""); - static_assert(a_int_r5::dimension::ArgN3 == 3, ""); - static_assert(a_int_r5::dimension::ArgN4 == 4, ""); - static_assert( - std::is_same<typename a_int_r5::non_const_value_type, int>::value, ""); + using a_int_r5 = ViewArrayAnalysis<t_i4*** [3]>; + + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 3); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 0); + static_assert(a_int_r5::dimension::ArgN3 == 3); + static_assert(a_int_r5::dimension::ArgN4 == 4); + static_assert(std::is_same_v<typename a_int_r5::non_const_value_type, int>); } { @@ -603,74 +594,55 @@ void test_view_mapping() { using a_const_int_r1 = ViewDataAnalysis<const int[], void>; - static_assert(std::is_void<typename a_const_int_r1::specialize>::value, ""); - static_assert(std::is_same<typename a_const_int_r1::dimension, - Kokkos::Impl::ViewDimension<0> >::value, - ""); + static_assert(std::is_void_v<typename a_const_int_r1::specialize>); + static_assert(std::is_same_v<typename a_const_int_r1::dimension, + Kokkos::Impl::ViewDimension<0> >); + static_assert(std::is_same_v<typename a_const_int_r1::type, const int*>); static_assert( - std::is_same<typename a_const_int_r1::type, const int*>::value, ""); - static_assert( - 
std::is_same<typename a_const_int_r1::value_type, const int>::value, - ""); + std::is_same_v<typename a_const_int_r1::value_type, const int>); - static_assert(std::is_same<typename a_const_int_r1::scalar_array_type, - const int*>::value, - ""); static_assert( - std::is_same<typename a_const_int_r1::const_type, const int*>::value, - ""); - static_assert(std::is_same<typename a_const_int_r1::const_value_type, - const int>::value, - ""); - static_assert(std::is_same<typename a_const_int_r1::const_scalar_array_type, - const int*>::value, - ""); + std::is_same_v<typename a_const_int_r1::scalar_array_type, const int*>); + static_assert( + std::is_same_v<typename a_const_int_r1::const_type, const int*>); + static_assert( + std::is_same_v<typename a_const_int_r1::const_value_type, const int>); static_assert( - std::is_same<typename a_const_int_r1::non_const_type, int*>::value, ""); + std::is_same_v<typename a_const_int_r1::const_scalar_array_type, + const int*>); static_assert( - std::is_same<typename a_const_int_r1::non_const_value_type, int>::value, - ""); + std::is_same_v<typename a_const_int_r1::non_const_type, int*>); + static_assert( + std::is_same_v<typename a_const_int_r1::non_const_value_type, int>); - using a_const_int_r3 = ViewDataAnalysis<const int* * [4], void>; + using a_const_int_r3 = ViewDataAnalysis<const int** [4], void>; - static_assert(std::is_void<typename a_const_int_r3::specialize>::value, ""); + static_assert(std::is_void_v<typename a_const_int_r3::specialize>); - static_assert(std::is_same<typename a_const_int_r3::dimension, - Kokkos::Impl::ViewDimension<0, 0, 4> >::value, - ""); + static_assert(std::is_same_v<typename a_const_int_r3::dimension, + Kokkos::Impl::ViewDimension<0, 0, 4> >); static_assert( - std::is_same<typename a_const_int_r3::type, const int* * [4]>::value, - ""); + std::is_same_v<typename a_const_int_r3::type, const int** [4]>); static_assert( - std::is_same<typename a_const_int_r3::value_type, const int>::value, - ""); - 
static_assert(std::is_same<typename a_const_int_r3::scalar_array_type, - const int* * [4]>::value, - ""); - static_assert(std::is_same<typename a_const_int_r3::const_type, - const int* * [4]>::value, - ""); - static_assert(std::is_same<typename a_const_int_r3::const_value_type, - const int>::value, - ""); - static_assert(std::is_same<typename a_const_int_r3::const_scalar_array_type, - const int* * [4]>::value, - ""); - static_assert(std::is_same<typename a_const_int_r3::non_const_type, - int* * [4]>::value, - ""); + std::is_same_v<typename a_const_int_r3::value_type, const int>); + static_assert(std::is_same_v<typename a_const_int_r3::scalar_array_type, + const int** [4]>); static_assert( - std::is_same<typename a_const_int_r3::non_const_value_type, int>::value, - ""); + std::is_same_v<typename a_const_int_r3::const_type, const int** [4]>); static_assert( - std::is_same<typename a_const_int_r3::non_const_scalar_array_type, - int* * [4]>::value, - ""); - - // std::cout << "typeid( const int**[4] ).name() = " << typeid( const - // int**[4] ).name() << std::endl; + std::is_same_v<typename a_const_int_r3::const_value_type, const int>); + static_assert( + std::is_same_v<typename a_const_int_r3::const_scalar_array_type, + const int** [4]>); + static_assert( + std::is_same_v<typename a_const_int_r3::non_const_type, int** [4]>); + static_assert( + std::is_same_v<typename a_const_int_r3::non_const_value_type, int>); + static_assert( + std::is_same_v<typename a_const_int_r3::non_const_scalar_array_type, + int** [4]>); } //---------------------------------------- @@ -695,44 +667,43 @@ void test_view_mapping() { ASSERT_EQ(vr1.data(), &data[0]); ASSERT_EQ(cr1.data(), &data[0]); - ASSERT_TRUE((std::is_same<typename T::data_type, int*>::value)); - ASSERT_TRUE((std::is_same<typename T::const_data_type, const int*>::value)); - ASSERT_TRUE((std::is_same<typename T::non_const_data_type, int*>::value)); + ASSERT_TRUE((std::is_same_v<typename T::data_type, int*>)); + 
ASSERT_TRUE((std::is_same_v<typename T::const_data_type, const int*>)); + ASSERT_TRUE((std::is_same_v<typename T::non_const_data_type, int*>)); - ASSERT_TRUE((std::is_same<typename T::scalar_array_type, int*>::value)); + ASSERT_TRUE((std::is_same_v<typename T::scalar_array_type, int*>)); ASSERT_TRUE( - (std::is_same<typename T::const_scalar_array_type, const int*>::value)); + (std::is_same_v<typename T::const_scalar_array_type, const int*>)); ASSERT_TRUE( - (std::is_same<typename T::non_const_scalar_array_type, int*>::value)); + (std::is_same_v<typename T::non_const_scalar_array_type, int*>)); - ASSERT_TRUE((std::is_same<typename T::value_type, int>::value)); - ASSERT_TRUE((std::is_same<typename T::const_value_type, const int>::value)); - ASSERT_TRUE((std::is_same<typename T::non_const_value_type, int>::value)); + ASSERT_TRUE((std::is_same_v<typename T::value_type, int>)); + ASSERT_TRUE((std::is_same_v<typename T::const_value_type, const int>)); + ASSERT_TRUE((std::is_same_v<typename T::non_const_value_type, int>)); - ASSERT_TRUE((std::is_same<typename T::memory_space, - typename Space::memory_space>::value)); - ASSERT_TRUE((std::is_same<typename T::reference_type, int&>::value)); + ASSERT_TRUE((std::is_same_v<typename T::memory_space, + typename Space::memory_space>)); + ASSERT_TRUE((std::is_same_v<typename T::reference_type, int&>)); ASSERT_EQ(T::rank, size_t(1)); - ASSERT_TRUE((std::is_same<typename C::data_type, const int*>::value)); - ASSERT_TRUE((std::is_same<typename C::const_data_type, const int*>::value)); - ASSERT_TRUE((std::is_same<typename C::non_const_data_type, int*>::value)); + ASSERT_TRUE((std::is_same_v<typename C::data_type, const int*>)); + ASSERT_TRUE((std::is_same_v<typename C::const_data_type, const int*>)); + ASSERT_TRUE((std::is_same_v<typename C::non_const_data_type, int*>)); + ASSERT_TRUE((std::is_same_v<typename C::scalar_array_type, const int*>)); ASSERT_TRUE( - (std::is_same<typename C::scalar_array_type, const int*>::value)); - 
ASSERT_TRUE( - (std::is_same<typename C::const_scalar_array_type, const int*>::value)); + (std::is_same_v<typename C::const_scalar_array_type, const int*>)); ASSERT_TRUE( - (std::is_same<typename C::non_const_scalar_array_type, int*>::value)); + (std::is_same_v<typename C::non_const_scalar_array_type, int*>)); - ASSERT_TRUE((std::is_same<typename C::value_type, const int>::value)); - ASSERT_TRUE((std::is_same<typename C::const_value_type, const int>::value)); - ASSERT_TRUE((std::is_same<typename C::non_const_value_type, int>::value)); + ASSERT_TRUE((std::is_same_v<typename C::value_type, const int>)); + ASSERT_TRUE((std::is_same_v<typename C::const_value_type, const int>)); + ASSERT_TRUE((std::is_same_v<typename C::non_const_value_type, int>)); - ASSERT_TRUE((std::is_same<typename C::memory_space, - typename Space::memory_space>::value)); - ASSERT_TRUE((std::is_same<typename C::reference_type, const int&>::value)); + ASSERT_TRUE((std::is_same_v<typename C::memory_space, + typename Space::memory_space>)); + ASSERT_TRUE((std::is_same_v<typename C::reference_type, const int&>)); ASSERT_EQ(C::rank, size_t(1)); @@ -764,23 +735,23 @@ void test_view_mapping() { T vr1("vr1", N); C cr1(vr1); - ASSERT_TRUE((std::is_same<typename T::data_type, int*>::value)); - ASSERT_TRUE((std::is_same<typename T::const_data_type, const int*>::value)); - ASSERT_TRUE((std::is_same<typename T::non_const_data_type, int*>::value)); + ASSERT_TRUE((std::is_same_v<typename T::data_type, int*>)); + ASSERT_TRUE((std::is_same_v<typename T::const_data_type, const int*>)); + ASSERT_TRUE((std::is_same_v<typename T::non_const_data_type, int*>)); - ASSERT_TRUE((std::is_same<typename T::scalar_array_type, int*>::value)); + ASSERT_TRUE((std::is_same_v<typename T::scalar_array_type, int*>)); ASSERT_TRUE( - (std::is_same<typename T::const_scalar_array_type, const int*>::value)); + (std::is_same_v<typename T::const_scalar_array_type, const int*>)); ASSERT_TRUE( - (std::is_same<typename 
T::non_const_scalar_array_type, int*>::value)); + (std::is_same_v<typename T::non_const_scalar_array_type, int*>)); - ASSERT_TRUE((std::is_same<typename T::value_type, int>::value)); - ASSERT_TRUE((std::is_same<typename T::const_value_type, const int>::value)); - ASSERT_TRUE((std::is_same<typename T::non_const_value_type, int>::value)); + ASSERT_TRUE((std::is_same_v<typename T::value_type, int>)); + ASSERT_TRUE((std::is_same_v<typename T::const_value_type, const int>)); + ASSERT_TRUE((std::is_same_v<typename T::non_const_value_type, int>)); - ASSERT_TRUE((std::is_same<typename T::memory_space, - typename Space::memory_space>::value)); - ASSERT_TRUE((std::is_same<typename T::reference_type, int&>::value)); + ASSERT_TRUE((std::is_same_v<typename T::memory_space, + typename Space::memory_space>)); + ASSERT_TRUE((std::is_same_v<typename T::reference_type, int&>)); ASSERT_EQ(T::rank, size_t(1)); ASSERT_EQ(vr1.extent(0), size_t(N)); @@ -818,8 +789,8 @@ void test_view_mapping() { // Testing using space instance for allocation. // The execution space of the memory space must be available for view data // initialization. - if (std::is_same<ExecSpace, - typename ExecSpace::memory_space::execution_space>::value) { + if (std::is_same_v<ExecSpace, + typename ExecSpace::memory_space::execution_space>) { using namespace Kokkos; using memory_space = typename ExecSpace::memory_space; @@ -1038,9 +1009,9 @@ void test_view_mapping() { ASSERT_EQ(a.use_count(), 1); ASSERT_EQ(b.use_count(), 0); -#if !defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_ENABLE_CUDA_LAMBDA) - // Cannot launch host lambda when CUDA lambda is enabled. - +// FIXME_NVCC For some reason, the use count is higher (but still constant) when +// using nvcc. Replacing the lambda with a functor doesn't show this behavior. 
+#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_NVCC)) using host_exec_space = typename Kokkos::Impl::HostMirror<Space>::Space::execution_space; @@ -1048,17 +1019,20 @@ void test_view_mapping() { Kokkos::parallel_reduce( Kokkos::RangePolicy<host_exec_space>(0, 10), KOKKOS_LAMBDA(int, int& e) { - // an unmanaged copy. When the parallel dispatch accepts a move for - // the lambda, this count should become 1. + // for parallel_reduce we copy the functor into a combined + // functor-reducer object (with reference-counting on) before + // constructing the ParallelReduce object (with reference-counting + // turned off). When the parallel dispatch accepts a move for the + // lambda, this count should become 2. - if (a.use_count() != 2) ++e; + if (a.use_count() != 3) ++e; V x = a; - if (a.use_count() != 2) ++e; - if (x.use_count() != 2) ++e; + if (a.use_count() != 3) ++e; + if (x.use_count() != 3) ++e; }, errors); ASSERT_EQ(errors, 0); -#endif // #if !defined( KOKKOS_ENABLE_CUDA_LAMBDA ) +#endif } } @@ -1295,7 +1269,7 @@ TEST(TEST_CATEGORY, view_mapping_operator) { } TEST(TEST_CATEGORY, static_extent) { - using T = Kokkos::View<double * [2][3]>; + using T = Kokkos::View<double* [2][3]>; ASSERT_EQ(T::static_extent(1), 2u); ASSERT_EQ(T::static_extent(2), 3u); } diff --git a/packages/kokkos/core/unit_test/TestViewMapping_b.hpp b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp index 9ac4e7da8453eea1363a3ea5bfd94c7b79d80717..4aee035d17a6e581a2a47a56e7980025df3e3116 100644 --- a/packages/kokkos/core/unit_test/TestViewMapping_b.hpp +++ b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp @@ -156,7 +156,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits<int, Kokkos::LayoutLeft, exec_space>; using src_traits = Kokkos::ViewTraits<int, Kokkos::LayoutRight, exec_space>; using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); 
Kokkos::View<int, Kokkos::LayoutRight, exec_space> src; Kokkos::View<int, Kokkos::LayoutLeft, exec_space> dst(src); @@ -167,7 +167,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits<int, Kokkos::LayoutRight, exec_space>; using src_traits = Kokkos::ViewTraits<int, Kokkos::LayoutLeft, exec_space>; using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View<int, Kokkos::LayoutLeft, exec_space> src; Kokkos::View<int, Kokkos::LayoutRight, exec_space> dst(src); @@ -180,7 +180,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits<int *, Kokkos::LayoutRight, exec_space>; using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View<int *, Kokkos::LayoutRight, exec_space> src; Kokkos::View<int *, Kokkos::LayoutLeft, exec_space> dst(src); @@ -193,7 +193,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits<int *, Kokkos::LayoutLeft, exec_space>; using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View<int *, Kokkos::LayoutLeft, exec_space> src; Kokkos::View<int *, Kokkos::LayoutRight, exec_space> dst(src); @@ -206,7 +206,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits<int **, Kokkos::LayoutRight, exec_space>; using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } { // Assignment of rank-2 Right = Left @@ -215,7 +215,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits<int **, Kokkos::LayoutLeft, exec_space>; using mapping = 
Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } } @@ -226,7 +226,7 @@ TEST(TEST_CATEGORY, view_mapping_trivially_copyable) { using src_traits = dst_traits; using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; - static_assert(std::is_trivially_copyable<mapping>{}, ""); + static_assert(std::is_trivially_copyable<mapping>{}); } } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp index 888abf4ca8dae2cbd8d2148f5be7033f817930c0..01c76f1f0e85fbbacedc7845a659b890830202ae 100644 --- a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp +++ b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp @@ -40,16 +40,16 @@ struct TestViewMappingSubview { using BS = Kokkos::Subview<BT, range, range, range>; enum { CN0 = 10, CN1 = 11, CN2 = 12 }; - using CT = Kokkos::View<int** * [13][14], ExecSpace>; + using CT = Kokkos::View<int*** [13][14], ExecSpace>; // changing CS to CTS here because when compiling with nvshmem, there is a // define for CS that makes this fail... 
using CTS = Kokkos::Subview<CT, range, range, range, int, int>; enum { DN0 = 10, DN1 = 11, DN2 = 12, DN3 = 13, DN4 = 14 }; - using DT = Kokkos::View<int** * [DN3][DN4], ExecSpace>; + using DT = Kokkos::View<int*** [DN3][DN4], ExecSpace>; using DS = Kokkos::Subview<DT, int, range, range, range, int>; - using DLT = Kokkos::View<int** * [13][14], Kokkos::LayoutLeft, ExecSpace>; + using DLT = Kokkos::View<int*** [13][14], Kokkos::LayoutLeft, ExecSpace>; using DLS1 = Kokkos::Subview<DLT, range, int, int, int, int>; #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 @@ -60,7 +60,7 @@ struct TestViewMappingSubview { "LayoutLeft"); #endif - using DRT = Kokkos::View<int** * [13][14], Kokkos::LayoutRight, ExecSpace>; + using DRT = Kokkos::View<int*** [13][14], Kokkos::LayoutRight, ExecSpace>; using DRS1 = Kokkos::Subview<DRT, int, int, int, int, range>; #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 diff --git a/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp b/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp index daf24ce7c0cb67de118e76764ac76022af5a4857..5dec72e1c83c46d1de607fa067eeb8b6fac421d3 100644 --- a/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp +++ b/packages/kokkos/core/unit_test/TestViewMemoryAccessViolation.hpp @@ -170,7 +170,7 @@ TEST(TEST_CATEGORY_DEATH, view_memory_access_violations_from_device) { } #if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL - if (std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value) { + if (std::is_same<ExecutionSpace, Kokkos::SYCL>::value) { GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " "is defined"; } diff --git a/packages/kokkos/core/unit_test/TestViewOfViews.hpp b/packages/kokkos/core/unit_test/TestViewOfViews.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1d53bca336d41c77051d5f48af1ff544ec96a59c --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewOfViews.hpp @@ -0,0 +1,129 @@ 
+//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace { + +// User-defined types with a View data member +template <class V> +class S { + V v_; + + public: + template <class... Extents> + S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + KOKKOS_DEFAULTED_FUNCTION S() = default; +}; + +template <class V> +class N { // not default constructible + V v_; + + public: + template <class... Extents> + N(std::string label, Extents... extents) : v_(std::move(label), extents...) {} +}; + +template <class V> +class H { // constructible and destructible only from on the host side + V v_; + + public: + template <class... Extents> + H(std::string label, Extents... extents) : v_(std::move(label), extents...) 
{} + H() {} + ~H() {} +}; + +template <class V> +void test_view_of_views_default() { + // assigning a default-constructed view to destruct the inner objects + using VoV = Kokkos::View<V**, Kokkos::HostSpace>; + VoV vov("vov", 2, 3); + V a("a"); + V b("b"); + vov(0, 0) = a; + vov(1, 0) = a; + vov(0, 1) = b; +#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND + vov(0, 0) = V(); + vov(1, 0) = V(); + vov(0, 1) = V(); +#endif +} + +template <class V> +void test_view_of_views_without_initializing() { + // using placement new to construct the inner objects and explicitly + // calling the destructor + using VoV = Kokkos::View<V**, Kokkos::HostSpace>; + VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3); + V a("a"); + V b("b"); + new (&vov(0, 0)) V(a); + new (&vov(1, 0)) V(a); + new (&vov(0, 1)) V(b); +#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND + vov(0, 0).~V(); + vov(1, 0).~V(); + vov(0, 1).~V(); +#else + // leaks memory +#endif +} + +template <class V> +void test_view_of_views_sequential_host_init() { + // inner views value-initialized sequentially on the host, and also + // sequentially destructed on the host, without the need to cleanup + using VoV = Kokkos::View<V**, Kokkos::HostSpace>; + VoV vov(Kokkos::view_alloc("vov", Kokkos::SequentialHostInit), 2, 3); + V a("a"); + V b("b"); + vov(0, 0) = a; + vov(1, 0) = a; + vov(0, 1) = b; +} + +TEST(TEST_CATEGORY, view_of_views_default) { + test_view_of_views_default<Kokkos::View<int, TEST_EXECSPACE>>(); + test_view_of_views_default<Kokkos::View<int[4], TEST_EXECSPACE>>(); + // User-defined type with View data member + test_view_of_views_default<S<Kokkos::View<float, TEST_EXECSPACE>>>(); +} + +TEST(TEST_CATEGORY, view_of_views_without_initializing) { + test_view_of_views_without_initializing<Kokkos::View<int, TEST_EXECSPACE>>(); + test_view_of_views_without_initializing< + S<Kokkos::View<float, TEST_EXECSPACE>>>(); + 
test_view_of_views_without_initializing< + N<Kokkos::View<double, TEST_EXECSPACE>>>(); + test_view_of_views_without_initializing< + H<Kokkos::View<int, TEST_EXECSPACE>>>(); +} + +TEST(TEST_CATEGORY, test_view_of_views_sequential_host_init) { + test_view_of_views_sequential_host_init<Kokkos::View<int, TEST_EXECSPACE>>(); + test_view_of_views_sequential_host_init< + S<Kokkos::View<float, TEST_EXECSPACE>>>(); + test_view_of_views_sequential_host_init< + H<Kokkos::View<int, TEST_EXECSPACE>>>(); +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp b/packages/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp new file mode 100644 index 0000000000000000000000000000000000000000..88602bd613e664f874a7fb07fcba714219df61b4 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp @@ -0,0 +1,175 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> +#include <sstream> + +#include <gtest/gtest.h> + +namespace { + +TEST(TEST_CATEGORY, append_formatted_multidimensional_index) { + using Kokkos::Impl::append_formatted_multidimensional_index; + { + char buffer[64] = "my prefix "; + append_formatted_multidimensional_index(buffer, 1); + EXPECT_STREQ(buffer, "my prefix [1]"); + } + { + char buffer[64] = "I was here"; + append_formatted_multidimensional_index(buffer, 1, 2, 3); + EXPECT_STREQ(buffer, "I was here[1,2,3]"); + } + { + char buffer[64] = "with mixed integer types "; + append_formatted_multidimensional_index(buffer, 1u, -2); + EXPECT_STREQ(buffer, "with mixed integer types [1,-2]"); + } +} + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + +template <class View, class ExecutionSpace> +struct TestViewOutOfBoundAccess { + View v; + static constexpr auto rank = View::rank; + + template <std::size_t... Is> + KOKKOS_FUNCTION decltype(auto) bad_access(std::index_sequence<Is...>) const { + return v((Is * 1 + Is == 0 ? v.extent(Is) + 3 : 0)...); + } + + KOKKOS_FUNCTION void operator()(int) const { + ++bad_access(std::make_index_sequence<rank>{}); + } + + template <std::size_t... Is> + std::string get_details(std::index_sequence<Is...>) { + std::stringstream ss; + ss << "with indices \\["; + ((ss << (Is == 0 ? v.extent(Is) + 3 : 0) + << (Is == View::rank() - 1 ? "\\]" : ",")), + ...); + ss << " but extents \\["; + ((ss << v.extent(Is) << (Is == View::rank() - 1 ? 
"\\]" : ",")), ...); + return ss.str(); + } + + auto get_details() { + return get_details(std::make_index_sequence<View::rank()>()); + } + + TestViewOutOfBoundAccess(View w, ExecutionSpace const& s, std::string matcher) + : v(std::move(w)) { + constexpr bool view_accessible_from_execution_space = + Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/typename View::memory_space>::accessible; + EXPECT_TRUE(view_accessible_from_execution_space); + + matcher += ".*" + get_details(); + + EXPECT_DEATH( + { + Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(s, 0, 1), + *this); + Kokkos::fence(); + }, + matcher); + } +}; + +template <class View, class LblOrPtr, std::size_t... Is> +auto make_view_impl(LblOrPtr x, std::index_sequence<Is...>) { + return View(x, (Is + 1)...); +} + +template <class View, class LblOrPtr> +auto make_view(LblOrPtr x) { + return make_view_impl<View>(std::move(x), + std::make_index_sequence<View::rank>()); +} + +template <class ExecutionSpace> +void test_view_out_of_bounds_access() { + ExecutionSpace const exec_space{}; + // clang-format off + using V1 = Kokkos::View<int*, ExecutionSpace>; + using V2 = Kokkos::View<int**, ExecutionSpace>; + using V3 = Kokkos::View<int***, ExecutionSpace>; + using V4 = Kokkos::View<int****, ExecutionSpace>; + using V5 = Kokkos::View<int*****, ExecutionSpace>; + using V6 = Kokkos::View<int******, ExecutionSpace>; + using V7 = Kokkos::View<int*******, ExecutionSpace>; + using V8 = Kokkos::View<int********, ExecutionSpace>; + std::string const prefix = "Kokkos::View ERROR: out of bounds access"; + std::string const lbl = "my_label"; + TestViewOutOfBoundAccess(make_view<V1>(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view<V2>(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view<V3>(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view<V4>(lbl), exec_space, prefix + ".*" + lbl); + 
TestViewOutOfBoundAccess(make_view<V5>(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view<V6>(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view<V7>(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view<V8>(lbl), exec_space, prefix + ".*" + lbl); + int* const ptr = nullptr; + TestViewOutOfBoundAccess(make_view<V1>(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view<V2>(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view<V3>(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view<V4>(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view<V5>(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view<V6>(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view<V7>(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view<V8>(ptr), exec_space, prefix + ".*UNMANAGED"); + // clang-format on +} + +TEST(TEST_CATEGORY_DEATH, view_out_of_bounds_access) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (false && Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/Kokkos::HostSpace>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + +#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL + if (std::is_same_v<ExecutionSpace, Kokkos::SYCL>) { + GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " + "is defined"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET + if (std::is_same_v<ExecutionSpace, Kokkos::Experimental::OpenMPTarget>) { + GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not " + "able to abort from the device"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same<ExecutionSpace, 
Kokkos::Experimental::OpenACC>::value) { + GTEST_SKIP() << "skipping because OpenACC backend is currently not " + "able to abort from the device"; + } +#endif + + test_view_out_of_bounds_access<ExecutionSpace>(); +} + +#endif + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestViewRank.cpp b/packages/kokkos/core/unit_test/TestViewRank.cpp index 7ea11afca37a938cdc5660116923aa33e5f05a45..bc6a9584d70c78c648d0b52d082cd9bd84c1fd13 100644 --- a/packages/kokkos/core/unit_test/TestViewRank.cpp +++ b/packages/kokkos/core/unit_test/TestViewRank.cpp @@ -33,9 +33,7 @@ constexpr bool test_view_rank_and_dynamic_rank() { static_assert(!std::is_integral_v<decltype(rank)>); auto rank_preferred = View::rank(); // since 4.0.01 static_assert(std::is_same_v<decltype(rank_preferred), size_t>); - (void)rank; - (void)rank_preferred; - return true; + return rank == rank_preferred; } // clang-format off diff --git a/packages/kokkos/core/unit_test/TestViewSubview.hpp b/packages/kokkos/core/unit_test/TestViewSubview.hpp index 386887d923eacef0ec1299058cf91435b88dca60..7011efcd31a481c65e385f92c26a71c72a0d455b 100644 --- a/packages/kokkos/core/unit_test/TestViewSubview.hpp +++ b/packages/kokkos/core/unit_test/TestViewSubview.hpp @@ -401,7 +401,7 @@ void test_left_0() { template <class Space> void test_left_1(bool use_constr) { using view_type = - Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutLeft, Space>; + Kokkos::View<int**** [2][3][4][5], Kokkos::LayoutLeft, Space>; if (Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename Space::memory_space>::accessible) { @@ -702,7 +702,7 @@ void test_right_0() { template <class Space> void test_right_1(bool use_constr) { using view_type = - Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutRight, Space>; + Kokkos::View<int**** [2][3][4][5], Kokkos::LayoutRight, Space>; if (Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename Space::memory_space>::accessible) { @@ -866,10 +866,10 @@ struct FillView_3D { using view_t = 
Kokkos::View<int***, Layout, Space>; using rank_t = Kokkos::Rank< view_t::rank, - std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left - : Kokkos::Iterate::Right, - std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left - : Kokkos::Iterate::Right>; + std::is_same_v<Layout, Kokkos::LayoutLeft> ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right, + std::is_same_v<Layout, Kokkos::LayoutLeft> ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right>; using policy_t = Kokkos::MDRangePolicy<exec_t, rank_t>; view_t a; @@ -894,10 +894,10 @@ struct FillView_4D { using view_t = Kokkos::View<int****, Layout, Space>; using rank_t = Kokkos::Rank< view_t::rank, - std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left - : Kokkos::Iterate::Right, - std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left - : Kokkos::Iterate::Right>; + std::is_same_v<Layout, Kokkos::LayoutLeft> ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right, + std::is_same_v<Layout, Kokkos::LayoutLeft> ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right>; using policy_t = Kokkos::MDRangePolicy<exec_t, rank_t>; view_t a; @@ -923,10 +923,10 @@ struct FillView_5D { using view_t = Kokkos::View<int*****, Layout, Space>; using rank_t = Kokkos::Rank< view_t::rank, - std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left - : Kokkos::Iterate::Right, - std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left - : Kokkos::Iterate::Right>; + std::is_same_v<Layout, Kokkos::LayoutLeft> ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right, + std::is_same_v<Layout, Kokkos::LayoutLeft> ? 
Kokkos::Iterate::Left + : Kokkos::Iterate::Right>; using policy_t = Kokkos::MDRangePolicy<exec_t, rank_t>; view_t a; @@ -1267,56 +1267,56 @@ template <class Space, class LayoutSub, class Layout, class LayoutOrg, void test_2d_subview_3d_impl_layout() { test_2d_subview_3d_impl_type<Space, int[N0][N1][N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, int[N0][N1][N2], int * [N2], LayoutSub, + test_2d_subview_3d_impl_type<Space, int[N0][N1][N2], int* [N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_2d_subview_3d_impl_type<Space, int[N0][N1][N2], int**, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, int * [N1][N2], int[N1][N2], LayoutSub, + test_2d_subview_3d_impl_type<Space, int* [N1][N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, int * [N1][N2], int * [N2], LayoutSub, + test_2d_subview_3d_impl_type<Space, int* [N1][N2], int* [N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, int * [N1][N2], int**, LayoutSub, Layout, + test_2d_subview_3d_impl_type<Space, int* [N1][N2], int**, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, int* * [N2], int[N1][N2], LayoutSub, + test_2d_subview_3d_impl_type<Space, int** [N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, int* * [N2], int * [N2], LayoutSub, - Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, int* * [N2], int**, LayoutSub, Layout, + test_2d_subview_3d_impl_type<Space, int** [N2], int* [N2], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int** [N2], int**, LayoutSub, Layout, LayoutOrg, MemTraits>(); test_2d_subview_3d_impl_type<Space, int***, int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, int***, int * [N2], LayoutSub, Layout, + 
test_2d_subview_3d_impl_type<Space, int***, int* [N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_2d_subview_3d_impl_type<Space, int***, int**, LayoutSub, Layout, LayoutOrg, MemTraits>(); test_2d_subview_3d_impl_type<Space, const int[N0][N1][N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, const int[N0][N1][N2], const int * [N2], + test_2d_subview_3d_impl_type<Space, const int[N0][N1][N2], const int* [N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_2d_subview_3d_impl_type<Space, const int[N0][N1][N2], const int**, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, const int * [N1][N2], const int[N1][N2], + test_2d_subview_3d_impl_type<Space, const int* [N1][N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, const int * [N1][N2], const int * [N2], + test_2d_subview_3d_impl_type<Space, const int* [N1][N2], const int* [N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, const int * [N1][N2], const int**, + test_2d_subview_3d_impl_type<Space, const int* [N1][N2], const int**, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, const int* * [N2], const int[N1][N2], + test_2d_subview_3d_impl_type<Space, const int** [N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, const int* * [N2], const int * [N2], + test_2d_subview_3d_impl_type<Space, const int** [N2], const int* [N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, const int* * [N2], const int**, LayoutSub, + test_2d_subview_3d_impl_type<Space, const int** [N2], const int**, LayoutSub, Layout, LayoutOrg, MemTraits>(); test_2d_subview_3d_impl_type<Space, const int***, const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_2d_subview_3d_impl_type<Space, const int***, const int * [N2], LayoutSub, + 
test_2d_subview_3d_impl_type<Space, const int***, const int* [N2], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_2d_subview_3d_impl_type<Space, const int***, const int**, LayoutSub, Layout, LayoutOrg, MemTraits>(); @@ -1349,54 +1349,54 @@ template <class Space, class LayoutSub, class Layout, class LayoutOrg, void test_3d_subview_5d_impl_layout() { test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int * [N3][N4], + test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int* * [N4], + test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int * [N1][N2][N3][N4], int[N2][N3][N4], + test_3d_subview_5d_impl_type<Space, int* [N1][N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int * [N1][N2][N3][N4], int * [N3][N4], + test_3d_subview_5d_impl_type<Space, int* [N1][N2][N3][N4], int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int * [N1][N2][N3][N4], int* * [N4], + test_3d_subview_5d_impl_type<Space, int* [N1][N2][N3][N4], int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int * [N1][N2][N3][N4], int***, LayoutSub, + test_3d_subview_5d_impl_type<Space, int* [N1][N2][N3][N4], int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int* * [N2][N3][N4], int[N2][N3][N4], - LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int* * [N2][N3][N4], int * [N3][N4], + 
test_3d_subview_5d_impl_type<Space, int** [N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int* * [N2][N3][N4], int* * [N4], + test_3d_subview_5d_impl_type<Space, int** [N2][N3][N4], int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int* * [N2][N3][N4], int***, LayoutSub, + test_3d_subview_5d_impl_type<Space, int** [N2][N3][N4], int** [N4], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int** [N2][N3][N4], int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int** * [N3][N4], int[N2][N3][N4], - LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int** * [N3][N4], int * [N3][N4], + test_3d_subview_5d_impl_type<Space, int*** [N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int** * [N3][N4], int* * [N4], LayoutSub, + test_3d_subview_5d_impl_type<Space, int*** [N3][N4], int* [N3][N4], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int*** [N3][N4], int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int** * [N3][N4], int***, LayoutSub, + test_3d_subview_5d_impl_type<Space, int*** [N3][N4], int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int*** * [N4], int[N2][N3][N4], LayoutSub, + test_3d_subview_5d_impl_type<Space, int**** [N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int*** * [N4], int * [N3][N4], LayoutSub, + test_3d_subview_5d_impl_type<Space, int**** [N4], int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int*** * [N4], int* * [N4], LayoutSub, + test_3d_subview_5d_impl_type<Space, int**** [N4], int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - 
test_3d_subview_5d_impl_type<Space, int*** * [N4], int***, LayoutSub, Layout, + test_3d_subview_5d_impl_type<Space, int**** [N4], int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); test_3d_subview_5d_impl_type<Space, int*****, int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int*****, int * [N3][N4], LayoutSub, + test_3d_subview_5d_impl_type<Space, int*****, int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, int*****, int* * [N4], LayoutSub, Layout, + test_3d_subview_5d_impl_type<Space, int*****, int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_3d_subview_5d_impl_type<Space, int*****, int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); @@ -1405,66 +1405,64 @@ void test_3d_subview_5d_impl_layout() { const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4], - const int * [N3][N4], LayoutSub, Layout, + const int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4], - const int* * [N4], LayoutSub, Layout, LayoutOrg, + const int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4], const int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int * [N1][N2][N3][N4], + test_3d_subview_5d_impl_type<Space, const int* [N1][N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int * [N1][N2][N3][N4], - const int * [N3][N4], LayoutSub, Layout, + test_3d_subview_5d_impl_type<Space, const int* [N1][N2][N3][N4], + const int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int * [N1][N2][N3][N4], - const int* * [N4], LayoutSub, Layout, LayoutOrg, - MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int * 
[N1][N2][N3][N4], - const int***, LayoutSub, Layout, LayoutOrg, + test_3d_subview_5d_impl_type<Space, const int* [N1][N2][N3][N4], + const int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int* [N1][N2][N3][N4], const int***, + LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int* * [N2][N3][N4], + test_3d_subview_5d_impl_type<Space, const int** [N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int* * [N2][N3][N4], - const int * [N3][N4], LayoutSub, Layout, + test_3d_subview_5d_impl_type<Space, const int** [N2][N3][N4], + const int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int* * [N2][N3][N4], - const int* * [N4], LayoutSub, Layout, LayoutOrg, + test_3d_subview_5d_impl_type<Space, const int** [N2][N3][N4], + const int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int* * [N2][N3][N4], const int***, + test_3d_subview_5d_impl_type<Space, const int** [N2][N3][N4], const int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int** * [N3][N4], + test_3d_subview_5d_impl_type<Space, const int*** [N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int** * [N3][N4], - const int * [N3][N4], LayoutSub, Layout, + test_3d_subview_5d_impl_type<Space, const int*** [N3][N4], + const int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int** * [N3][N4], const int* * [N4], + test_3d_subview_5d_impl_type<Space, const int*** [N3][N4], const int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int** * [N3][N4], const int***, + test_3d_subview_5d_impl_type<Space, const int*** [N3][N4], const int***, LayoutSub, Layout, 
LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int*** * [N4], - const int[N2][N3][N4], LayoutSub, Layout, - LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int*** * [N4], const int * [N3][N4], + test_3d_subview_5d_impl_type<Space, const int**** [N4], const int[N2][N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int**** [N4], const int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int*** * [N4], const int* * [N4], + test_3d_subview_5d_impl_type<Space, const int**** [N4], const int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int*** * [N4], const int***, + test_3d_subview_5d_impl_type<Space, const int**** [N4], const int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); test_3d_subview_5d_impl_type<Space, const int*****, const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int*****, const int * [N3][N4], + test_3d_subview_5d_impl_type<Space, const int*****, const int* [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); - test_3d_subview_5d_impl_type<Space, const int*****, const int* * [N4], + test_3d_subview_5d_impl_type<Space, const int*****, const int** [N4], LayoutSub, Layout, LayoutOrg, MemTraits>(); test_3d_subview_5d_impl_type<Space, const int*****, const int***, LayoutSub, Layout, LayoutOrg, MemTraits>(); @@ -2204,7 +2202,7 @@ struct template <class Space, class Layout> struct TestSubviewStaticSizes { - Kokkos::View<int * [10][5][2], Layout, Space> a; + Kokkos::View<int* [10][5][2], Layout, Space> a; Kokkos::View<int[6][7][8], Layout, Space> b; KOKKOS_INLINE_FUNCTION @@ -2231,7 +2229,7 @@ struct TestSubviewStaticSizes { auto sub_a_4 = Kokkos::subview(a, Kokkos::ALL, 0, Kokkos::ALL, Kokkos::ALL); typename static_expect_same< - /* expected */ int * [5][2], + /* expected */ int* [5][2], /* actual */ typename 
get_view_type<decltype(sub_a_4)>::type>::type test_4 = 0; @@ -2253,14 +2251,14 @@ struct TestSubviewStaticSizes { auto sub_a_7 = Kokkos::subview(a, Kokkos::ALL, 0, Kokkos::make_pair(0, 1), Kokkos::ALL); typename static_expect_same< - /* expected */ int* * [2], + /* expected */ int** [2], /* actual */ typename get_view_type<decltype(sub_a_7)>::type>::type test_7 = 0; auto sub_a_8 = Kokkos::subview(a, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL); typename static_expect_same< - /* expected */ int * [10][5][2], + /* expected */ int* [10][5][2], /* actual */ typename get_view_type<decltype(sub_a_8)>::type>::type test_8 = 0; @@ -2279,7 +2277,7 @@ struct TestSubviewStaticSizes { auto sub_b_3 = Kokkos::subview(b, Kokkos::make_pair(2, 3), Kokkos::ALL, Kokkos::ALL); typename static_expect_same< - /* expected */ int * [7][8], + /* expected */ int* [7][8], /* actual */ typename get_view_type<decltype(sub_b_3)>::type>::type test_11 = 0; @@ -2294,11 +2292,10 @@ template <class Space> struct TestExtentsStaticTests { using test1 = typename static_expect_same< /* expected */ - Kokkos::Experimental::Extents<Kokkos::Experimental::dynamic_extent, - Kokkos::Experimental::dynamic_extent, 1, 2, - 3>, + Kokkos::Experimental::Extents<Kokkos::dynamic_extent, + Kokkos::dynamic_extent, 1, 2, 3>, /* actual */ - typename Kokkos::Impl::ParseViewExtents<double* * [1][2][3]>::type>::type; + typename Kokkos::Impl::ParseViewExtents<double** [1][2][3]>::type>::type; using test2 = typename static_expect_same< /* expected */ diff --git a/packages/kokkos/core/unit_test/TestViewTypedefs.cpp b/packages/kokkos/core/unit_test/TestViewTypedefs.cpp new file mode 100644 index 0000000000000000000000000000000000000000..51f5e524d9c850c905f2ab9b2c40aa2a8f43f655 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewTypedefs.cpp @@ -0,0 +1,274 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +namespace { + +// clang-format off +template<class DataType> +struct data_analysis { + using data_type = DataType; + using const_data_type = const DataType; + using runtime_data_type = DataType; + using runtime_const_data_type = const DataType; + using non_const_data_type = std::remove_const_t<DataType>; +}; + +template<class DataType> +struct data_analysis<DataType*> { + using data_type = typename data_analysis<DataType>::data_type*; + using const_data_type = typename data_analysis<DataType>::const_data_type*; + using runtime_data_type = typename data_analysis<DataType>::runtime_data_type*; + using runtime_const_data_type = typename data_analysis<DataType>::runtime_const_data_type*; + using non_const_data_type = typename data_analysis<DataType>::non_const_data_type*; +}; + +template<class DataType, size_t N> +struct data_analysis<DataType[N]> { + using data_type = typename data_analysis<DataType>::data_type[N]; + using const_data_type = typename data_analysis<DataType>::const_data_type[N]; + using runtime_data_type = typename data_analysis<DataType>::runtime_data_type*; + using runtime_const_data_type = typename data_analysis<DataType>::runtime_const_data_type*; + using non_const_data_type = typename data_analysis<DataType>::non_const_data_type[N]; +}; + +template<class ViewType, class ViewTraitsType, class DataType, class Layout, class Space, class MemoryTraitsType, + class HostMirrorSpace, class ValueType, class ReferenceType> +constexpr bool test_view_typedefs_impl() { + // ======================== + // 
inherited from ViewTraits + // ======================== + static_assert(std::is_same_v<typename ViewType::data_type, DataType>); + static_assert(std::is_same_v<typename ViewType::const_data_type, typename data_analysis<DataType>::const_data_type>); + static_assert(std::is_same_v<typename ViewType::non_const_data_type, typename data_analysis<DataType>::non_const_data_type>); + + // FIXME: these should be deprecated and for proper testing (I.e. where this is different from data_type) + // we would need ensemble types which use the hidden View dimension facility of View (i.e. which make + // "specialize" not void) + static_assert(std::is_same_v<typename ViewType::scalar_array_type, DataType>); + static_assert(std::is_same_v<typename ViewType::const_scalar_array_type, typename data_analysis<DataType>::const_data_type>); + static_assert(std::is_same_v<typename ViewType::non_const_scalar_array_type, typename data_analysis<DataType>::non_const_data_type>); + static_assert(std::is_same_v<typename ViewType::specialize, void>); + + // FIXME: value_type definition conflicts with mdspan value_type + static_assert(std::is_same_v<typename ViewType::value_type, ValueType>); + static_assert(std::is_same_v<typename ViewType::const_value_type, const ValueType>); + static_assert(std::is_same_v<typename ViewType::non_const_value_type, std::remove_const_t<ValueType>>); + + // FIXME: should maybe be deprecated + static_assert(std::is_same_v<typename ViewType::array_layout, Layout>); + + // FIXME: should be deprecated and is some complicated impl type + static_assert(!std::is_void_v<typename ViewType::dimension>); + + static_assert(std::is_same_v<typename ViewType::execution_space, typename Space::execution_space>); + static_assert(std::is_same_v<typename ViewType::memory_space, typename Space::memory_space>); + static_assert(std::is_same_v<typename ViewType::device_type, Kokkos::Device<typename ViewType::execution_space, typename ViewType::memory_space>>); + 
static_assert(std::is_same_v<typename ViewType::memory_traits, MemoryTraitsType>); + static_assert(std::is_same_v<typename ViewType::host_mirror_space, HostMirrorSpace>); + static_assert(std::is_same_v<typename ViewType::size_type, typename ViewType::memory_space::size_type>); + + // FIXME: should be deprecated in favor of reference + static_assert(std::is_same_v<typename ViewType::reference_type, ReferenceType>); + // FIXME: should be deprecated in favor of data_handle_type + static_assert(std::is_same_v<typename ViewType::pointer_type, ValueType*>); + + // ========================================= + // in Legacy View: some helper View variants + // ========================================= + static_assert(std::is_same_v<typename ViewType::traits, ViewTraitsType>); + static_assert(std::is_same_v<typename ViewType::array_type, + Kokkos::View<typename ViewType::scalar_array_type, typename ViewType::array_layout, + typename ViewType::device_type, typename ViewTraitsType::hooks_policy, + typename ViewType::memory_traits>>); + static_assert(std::is_same_v<typename ViewType::const_type, + Kokkos::View<typename ViewType::const_data_type, typename ViewType::array_layout, + typename ViewType::device_type, typename ViewTraitsType::hooks_policy, + typename ViewType::memory_traits>>); + static_assert(std::is_same_v<typename ViewType::non_const_type, + Kokkos::View<typename ViewType::non_const_data_type, typename ViewType::array_layout, + typename ViewType::device_type, typename ViewTraitsType::hooks_policy, + typename ViewType::memory_traits>>); + static_assert(std::is_same_v<typename ViewType::host_mirror_type, + Kokkos::View<typename ViewType::non_const_data_type, typename ViewType::array_layout, + Kokkos::Device<Kokkos::DefaultHostExecutionSpace, + typename ViewType::host_mirror_space::memory_space>, + typename ViewTraitsType::hooks_policy>>); + + using uniform_layout_type = std::conditional_t<ViewType::rank()==0 || (ViewType::rank()==0 && + std::is_same_v<Layout, 
Kokkos::LayoutRight>), + Kokkos::LayoutLeft, Layout>; + + // FIXME: uniformtype removes all memorytraits? + static_assert(std::is_same_v<typename ViewType::uniform_type, + Kokkos::View<typename ViewType::data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_const_type, + Kokkos::View<typename ViewType::const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_runtime_type, + Kokkos::View<typename data_analysis<DataType>::runtime_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_runtime_const_type, + Kokkos::View<typename data_analysis<DataType>::runtime_const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + + using anonymous_device_type = Kokkos::Device<typename ViewType::execution_space, Kokkos::AnonymousSpace>; + static_assert(std::is_same_v<typename ViewType::uniform_nomemspace_type, + Kokkos::View<typename ViewType::data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_const_nomemspace_type, + Kokkos::View<typename ViewType::const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_runtime_nomemspace_type, + Kokkos::View<typename data_analysis<DataType>::runtime_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v<typename ViewType::uniform_runtime_const_nomemspace_type, + Kokkos::View<typename data_analysis<DataType>::runtime_const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + + + // ================================== + // mdspan 
compatibility + // ================================== + + // FIXME: This typedef caused some weird issue with MSVC+NVCC + // static_assert(std::is_same_v<typename ViewType::layout_type, Layout>); + // FIXME: Not supported yet + // static_assert(std::is_same_v<typename ViewType::extents_type, >); + // static_assert(std::is_same_v<typename ViewType::mapping_type, >); + // static_assert(std::is_same_v<typename ViewType::accessor_type, >); + + static_assert(std::is_same_v<typename ViewType::element_type, ValueType>); + // FIXME: should be remove_const_t<element_type> + static_assert(std::is_same_v<typename ViewType::value_type, ValueType>); + // FIXME: should be extents_type::index_type + static_assert(std::is_same_v<typename ViewType::index_type, typename Space::memory_space::size_type>); + // FIXME: this isn't given in View since for example SYCL has "int" as its size_type + // static_assert(std::is_same_v<typename ViewType::size_type, std::make_unsigned_t<typename ViewType::index_type>>); + static_assert(std::is_same_v<typename ViewType::rank_type, size_t>); + + // FIXME: should come from accessor_type + static_assert(std::is_same_v<typename ViewType::data_handle_type, typename ViewType::pointer_type>); + static_assert(std::is_same_v<typename ViewType::reference, typename ViewType::reference_type>); + return true; +} + +// Helper function to unpack data type and other args from the View, and pass them on +template<class T, class ... ViewArgs> +struct ViewParams {}; + +template<class L, class S, class M, class HostMirrorSpace, class ValueType, class ReferenceType, class T, class ... 
ViewArgs> +constexpr bool test_view_typedefs(ViewParams<T, ViewArgs...>) { + return test_view_typedefs_impl<Kokkos::View<T, ViewArgs...>, Kokkos::ViewTraits<T, ViewArgs...>, + T, L, S, M, HostMirrorSpace, ValueType, ReferenceType>(); +} + + +constexpr bool is_host_exec = std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::DefaultHostExecutionSpace>; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) || defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +constexpr bool has_unified_mem_space = true; +#else +constexpr bool has_unified_mem_space = false; +#endif + +// The test take explicit template arguments for: LayoutType, Space, MemoryTraits, HostMirrorSpace, ValueType, ReferenceType +// The ViewParams is just a type pack for the View template arguments + +// Kokkos::View<int> +namespace TestInt { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t<is_host_exec, Kokkos::DefaultExecutionSpace, + // otherwise if unified memory is not on its HostSpace + std::conditional_t<!has_unified_mem_space, Kokkos::HostSpace, + // otherwise its the following Device type + Kokkos::Device<Kokkos::DefaultHostExecutionSpace, typename Kokkos::DefaultExecutionSpace::memory_space>>>; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, int, int&>( + ViewParams<int>{})); +} + +// Kokkos::View<int, DefaultExecutionSpace> +namespace TestIntDefaultExecutionSpace { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, it is HostSpace (note difference from View<int> ...) 
+ using host_mirror_space = std::conditional_t<is_host_exec, Kokkos::HostSpace, + // otherwise if unified memory is not on its also HostSpace! + std::conditional_t<!has_unified_mem_space, Kokkos::HostSpace, + // otherwise its the following memory space ... + Kokkos::DefaultExecutionSpace::memory_space>>; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, int, int&>( + ViewParams<int, Kokkos::DefaultExecutionSpace>{})); +} + +// Kokkos::View<const float**, Kokkos::HostSpace> +namespace TestFloatPPHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::HostSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, const float, const float&>( + ViewParams<const float**, Kokkos::HostSpace>{})); +} + +// Kokkos::View<float*[3], Kokkos::LayoutLeft> +namespace TestFloatP3LayoutLeft { + using layout_type = Kokkos::LayoutLeft; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t<is_host_exec, Kokkos::DefaultExecutionSpace, + // otherwise if unified memory is not on its HostSpace + std::conditional_t<!has_unified_mem_space, Kokkos::HostSpace, + // otherwise its the following Device type + Kokkos::Device<Kokkos::DefaultHostExecutionSpace, typename Kokkos::DefaultExecutionSpace::memory_space>>>; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, float, float&>( + ViewParams<float*[3], Kokkos::LayoutLeft>{})); +} + +// Kokkos::View<float[2][3], Kokkos::Device<Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>> +namespace TestFloatPPDeviceDefaultHostExecHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::Device<Kokkos::DefaultHostExecutionSpace, 
Kokkos::HostSpace>; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, float, float&>( + ViewParams<float[2][3], Kokkos::LayoutRight, Kokkos::Device<Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>>{})); +} + +// Kokkos::View<int, Kokkos::MemoryTraits<Kokkos::Atomic>> +namespace TestIntAtomic { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<Kokkos::Atomic>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t<is_host_exec, Kokkos::DefaultExecutionSpace, + // otherwise if unified memory is not on its HostSpace + std::conditional_t<!has_unified_mem_space, Kokkos::HostSpace, + // otherwise its the following Device type + Kokkos::Device<Kokkos::DefaultHostExecutionSpace, typename Kokkos::DefaultExecutionSpace::memory_space>>>; + static_assert(test_view_typedefs<layout_type, space, memory_traits, host_mirror_space, int, + Kokkos::Impl::AtomicDataElement<Kokkos::ViewTraits<int, Kokkos::MemoryTraits<Kokkos::Atomic>>>>( + ViewParams<int, Kokkos::MemoryTraits<Kokkos::Atomic>>{})); +} +// clang-format on +} // namespace diff --git a/packages/kokkos/core/unit_test/TestView_64bit.hpp b/packages/kokkos/core/unit_test/TestView_64bit.hpp index f30fe2d13275fd03497ae1a055285b35a60452f0..08fda0e44440103d107490d3cc40d5cf75c0b2ef 100644 --- a/packages/kokkos/core/unit_test/TestView_64bit.hpp +++ b/packages/kokkos/core/unit_test/TestView_64bit.hpp @@ -20,7 +20,6 @@ namespace Test { template <class Device> void test_64bit() { -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) // We are running out of device memory on Intel GPUs #ifdef KOKKOS_ENABLE_SYCL int64_t N = 4000000000; @@ -106,7 +105,6 @@ void test_64bit() { (P * (P - 1) / 2) * int64_t(N0 / P) + (N0 % P) * 
(N0 % P - 1) / 2; ASSERT_EQ(expected, sum0); } -#endif } #ifdef KOKKOS_ENABLE_LARGE_MEM_TESTS diff --git a/packages/kokkos/core/unit_test/TestWithoutInitializing.hpp b/packages/kokkos/core/unit_test/TestWithoutInitializing.hpp index 0509cc62b9641f56bd83a7691503a56f092f4eb1..da8ed9d506951a28f77ec0188e13b45c2c34dde8 100644 --- a/packages/kokkos/core/unit_test/TestWithoutInitializing.hpp +++ b/packages/kokkos/core/unit_test/TestWithoutInitializing.hpp @@ -21,7 +21,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); - Kokkos::View<int*** * [1][2][3][4], TEST_EXECSPACE> bla("bla", 5, 6, 7, 8); + Kokkos::View<int**** [1][2][3][4], TEST_EXECSPACE> bla("bla", 5, 6, 7, 8); auto success = validate_absence( [&]() { @@ -48,7 +48,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); - Kokkos::View<int*** * [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6, 5); + Kokkos::View<int**** [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -133,8 +133,7 @@ TEST(TEST_CATEGORY, view_alloc) { }, [&](BeginFenceEvent event) { return MatchDiagnostic{ - event.descriptor().find( - "Kokkos::Impl::ViewValueFunctor: View init/destroy fence") != + event.descriptor().find("Kokkos::View::initialization") != std::string::npos}; }); ASSERT_TRUE(success); @@ -155,8 +154,7 @@ TEST(TEST_CATEGORY, view_alloc_exec_space) { }, [&](BeginFenceEvent event) { return MatchDiagnostic{ - event.descriptor().find( - "Kokkos::Impl::ViewValueFunctor: View init/destroy fence") != + event.descriptor().find("Kokkos::View::initialization") != std::string::npos}; }); ASSERT_TRUE(success); @@ -177,8 +175,7 @@ TEST(TEST_CATEGORY, view_alloc_int) { }, [&](BeginFenceEvent event) { return MatchDiagnostic{ - event.descriptor().find( - 
"Kokkos::Impl::ViewValueFunctor: View init/destroy fence") != + event.descriptor().find("Kokkos::View::initialization") != std::string::npos}; }); ASSERT_TRUE(success); @@ -199,8 +196,7 @@ TEST(TEST_CATEGORY, view_alloc_exec_space_int) { }, [&](BeginFenceEvent event) { return MatchDiagnostic{ - event.descriptor().find( - "Kokkos::Impl::ViewValueFunctor: View init/destroy fence") != + event.descriptor().find("Kokkos::View::initialization") != std::string::npos}; }); ASSERT_TRUE(success); @@ -225,14 +221,30 @@ TEST(TEST_CATEGORY, deep_copy_zero_memset) { listen_tool_events(Config::DisableAll(), Config::EnableKernels()); Kokkos::View<int*, TEST_EXECSPACE> bla("bla", 8); - auto success = - validate_absence([&]() { Kokkos::deep_copy(bla, 0); }, - [&](BeginParallelForEvent) { - return MatchDiagnostic{true, {"Found begin event"}}; - }, - [&](EndParallelForEvent) { - return MatchDiagnostic{true, {"Found end event"}}; - }); + // for MI300A with unified memory, ZeroMemset uses a parallel for + auto success = false; +#ifdef KOKKOS_IMPL_HIP_UNIFIED_MEMORY + if constexpr (!std::is_same_v<TEST_EXECSPACE::memory_space, + Kokkos::HostSpace>) + success = validate_existence( + [&]() { Kokkos::deep_copy(bla, 0); }, + [&](BeginParallelForEvent e) { + const bool found = + (e.descriptor().find("Kokkos::ZeroMemset via parallel_for") != + std::string::npos); + return MatchDiagnostic{found, {"Found expected parallel_for label"}}; + }); + else +#endif + success = + validate_absence([&]() { Kokkos::deep_copy(bla, 0); }, + [&](BeginParallelForEvent) { + return MatchDiagnostic{true, {"Found begin event"}}; + }, + [&](EndParallelForEvent) { + return MatchDiagnostic{true, {"Found end event"}}; + }); + ASSERT_TRUE(success); listen_tool_events(Config::DisableAll()); } @@ -241,7 +253,7 @@ TEST(TEST_CATEGORY, resize_exec_space) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); - Kokkos::View<int*** * [1][2][3][4], 
TEST_EXECSPACE> bla("bla", 8, 7, 6, 5); + Kokkos::View<int**** [1][2][3][4], TEST_EXECSPACE> bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { diff --git a/packages/kokkos/core/unit_test/UnitTest_CMakePassCmdLineArgs.cpp b/packages/kokkos/core/unit_test/UnitTest_CMakePassCmdLineArgs.cpp index 3a552bc31e32fef3f5eeb15cc7559d3df744ebb2..4e70f5dfb4a1c014ea46b0061f8e255363a712fe 100644 --- a/packages/kokkos/core/unit_test/UnitTest_CMakePassCmdLineArgs.cpp +++ b/packages/kokkos/core/unit_test/UnitTest_CMakePassCmdLineArgs.cpp @@ -14,14 +14,15 @@ // //@HEADER +#include <cstdlib> +#include <iostream> #include <string> -struct Up {}; - int main(int argc, char* argv[]) { if (argc != 4 || std::string(argv[1]) != "one" || std::string(argv[2]) != "2" || std::string(argv[3]) != "THREE") { - throw Up{}; + std::cerr << "must be called as `<exe> one 2 THREE`\n"; + return EXIT_FAILURE; } - return 0; + return EXIT_SUCCESS; } diff --git a/packages/kokkos/core/unit_test/UnitTest_CMakeTriBITSCompatibility.cpp b/packages/kokkos/core/unit_test/UnitTest_CMakeTriBITSCompatibility.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2f0f6576740726c1ab50310433d01f86d5c5b38 --- /dev/null +++ b/packages/kokkos/core/unit_test/UnitTest_CMakeTriBITSCompatibility.cpp @@ -0,0 +1,33 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <cstdlib> +#include <iostream> +#include <string_view> + +int main(int argc, char* argv[]) { + if (std::getenv("KOKKOS_TEST_TRIBITS_COMPATIBILITY")) { + return EXIT_SUCCESS; + } + if (argc == 2 && std::string_view(argv[1]).find( + "--kokkos-test-tribits-compatibility") == 0) { + return EXIT_SUCCESS; + } + std::cerr << "must be called with `KOKKOS_TEST_TRIBITS_COMPATIBILITY` " + "environment variable set or pass " + "`--kokkos-test-tribits-compatibility` as command line argument"; + return EXIT_FAILURE; +} diff --git a/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp b/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp index b522ac3e69b748165d2c553739d54819388e7bc6..25442146fbad77b67c80373aaaa910d05b88617d 100644 --- a/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -19,22 +19,23 @@ #include <string> #include <thread> -int get_device_count() { +int get_num_devices() { + int num_devices; #if defined(KOKKOS_ENABLE_CUDA) - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - return count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_HIP) - int count; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&count)); - return count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_num_devices(); + num_devices = omp_get_num_devices(); #elif defined(KOKKOS_ENABLE_OPENACC) - return acc_get_num_devices(acc_get_device_type()); + num_devices = acc_get_num_devices(acc_get_device_type()); +#elif defined(KOKKOS_ENABLE_SYCL) + num_devices = sycl::device::get_devices(sycl::info::device_type::gpu).size(); #else - return 0; + num_devices = -1; #endif + assert(num_devices == Kokkos::num_devices()); + return num_devices; } int get_device_id() { @@ -44,15 +45,17 @@ int 
get_device_id() { #elif defined(KOKKOS_ENABLE_HIP) KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - device_id = omp_get_device_num(); + device_id = omp_get_default_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - device_id = acc_get_device_num(acc_get_device_type()); + device_id = acc_get_device_num(acc_get_device_type()); #elif defined(KOKKOS_ENABLE_SYCL) - // FIXME_SYCL ? - assert(false); - return -2; + // Not able to query the underlying runtime because there is no such thing as + // device currently being used with SYCL. We go through the Kokkos runtime + // which makes the assert below pointless but it still let us check that + // Kokkos selected the device we asked for from the Python tests. + device_id = Kokkos::device_id(); #else - device_id = -1; + device_id = -1; #endif assert(device_id == Kokkos::device_id()); return device_id; @@ -68,6 +71,14 @@ int get_max_threads() { #endif } +int get_hwloc_enabled() { +#ifdef KOKKOS_ENABLE_HWLOC + return 1; +#else + return 0; +#endif +} + int get_num_threads() { int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency(); assert(num_threads == Kokkos::num_threads()); @@ -90,9 +101,10 @@ int print_flag(std::string const& flag) { KOKKOS_TEST_PRINT_FLAG(num_threads); KOKKOS_TEST_PRINT_FLAG(max_threads); KOKKOS_TEST_PRINT_FLAG(device_id); - KOKKOS_TEST_PRINT_FLAG(device_count); + KOKKOS_TEST_PRINT_FLAG(num_devices); KOKKOS_TEST_PRINT_FLAG(disable_warnings); KOKKOS_TEST_PRINT_FLAG(tune_internals); + KOKKOS_TEST_PRINT_FLAG(hwloc_enabled); #undef KOKKOS_TEST_PRINT_FLAG diff --git a/packages/kokkos/core/unit_test/UnitTest_ScopeGuard.cpp b/packages/kokkos/core/unit_test/UnitTest_ScopeGuard.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2176f3ef059f7efa9e25f9b45ee247f3e35a421 --- /dev/null +++ b/packages/kokkos/core/unit_test/UnitTest_ScopeGuard.cpp @@ -0,0 +1,155 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <cstdlib> +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace { + +/** + * Fixture that checks Kokkos is neither initialized nor finalized before and + * after the test. + */ +class AssertEnvironmentTest : public ::testing::Test { + protected: + void SetUp() override { + ASSERT_FALSE(Kokkos::is_initialized()); + ASSERT_FALSE(Kokkos::is_finalized()); + } + + void TearDown() override { + ASSERT_FALSE(Kokkos::is_initialized()); + ASSERT_FALSE(Kokkos::is_finalized()); + } +}; + +using scope_guard_DeathTest = AssertEnvironmentTest; + +/** + * Test to create a scope guard normally. + */ +TEST_F(scope_guard_DeathTest, create) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + // run it in a different process so side effects are not kept + EXPECT_EXIT( + { + { + Kokkos::ScopeGuard guard{}; + + if (!Kokkos::is_initialized()) std::exit(EXIT_FAILURE); + if (Kokkos::is_finalized()) std::exit(EXIT_FAILURE); + } + + if (Kokkos::is_initialized()) std::exit(EXIT_FAILURE); + if (!Kokkos::is_finalized()) std::exit(EXIT_FAILURE); + + std::exit(EXIT_SUCCESS); + }, + testing::ExitedWithCode(EXIT_SUCCESS), ""); +} + +/** + * Test to create a scope guard with an argument. 
+ */ +TEST_F(scope_guard_DeathTest, create_argument) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + // run it in a different process so side effects are not kept + EXPECT_EXIT( + { + { + Kokkos::InitializationSettings settings{}; + Kokkos::ScopeGuard guard{settings}; + } + + std::exit(EXIT_SUCCESS); + }, + testing::ExitedWithCode(EXIT_SUCCESS), ""); +} + +/** + * Test to create another scope guard when one has been created. + */ +TEST_F(scope_guard_DeathTest, create_while_initialize) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + { + Kokkos::ScopeGuard guard1{}; + + // create a second scope guard while there is one already existing + Kokkos::ScopeGuard guard2{}; + }, + "Creating a ScopeGuard while Kokkos is initialized"); +} + +/** + * Test to create a scope guard when initialization has been done manually. + */ +TEST_F(scope_guard_DeathTest, create_after_initialize) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + { + Kokkos::initialize(); + + // create a scope guard after manual initialization + Kokkos::ScopeGuard guard{}; + }, + "Creating a ScopeGuard while Kokkos is initialized"); +} + +/** + * Test to create another scope guard when one has been destroyed. + */ +TEST_F(scope_guard_DeathTest, create_after_finalize) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + { + { Kokkos::ScopeGuard guard1{}; } + + // create a second scope guard while the first one has been destroyed + // already + Kokkos::ScopeGuard guard2{}; + }, + "Creating a ScopeGuard after Kokkos was finalized"); +} + +/** + * Test to destroy a scope guard when finalization has been done manually. 
+ */ +TEST_F(scope_guard_DeathTest, destroy_after_finalize) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + EXPECT_DEATH( + { + // create a scope guard and finalize it manually + Kokkos::ScopeGuard guard{}; + Kokkos::finalize(); + }, + "Destroying a ScopeGuard after Kokkos was finalized"); +} + +/** + * Static tests + */ + +// Test scope guard is not copyable. +static_assert(!std::is_copy_assignable<Kokkos::ScopeGuard>()); +static_assert(!std::is_copy_constructible<Kokkos::ScopeGuard>()); + +// Test scope guard is not movable. +static_assert(!std::is_move_assignable<Kokkos::ScopeGuard>()); +static_assert(!std::is_move_constructible<Kokkos::ScopeGuard>()); + +} // namespace diff --git a/packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp index 28704de29ea619f333c9def78443a9ab45bd28dc..7b00fea2443430141b66844c7846cb2d52430df8 100644 --- a/packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp +++ b/packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp @@ -21,8 +21,8 @@ #define TEST_CATEGORY cuda_hostpinned #define TEST_CATEGORY_DEATH cuda_hostpinned_DeathTest -//#define TEST_EXECSPACE -// Kokkos::Device<Kokkos::Cuda,Kokkos::CudaHostPinnedSpace> +// #define TEST_EXECSPACE +// Kokkos::Device<Kokkos::Cuda,Kokkos::CudaHostPinnedSpace> #define TEST_EXECSPACE Kokkos::CudaHostPinnedSpace #endif diff --git a/packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp index d3a7cdbea530ff655770a6fd96b7f0bffae0ed0b..c6a2aa9f201fc364218e06263538087e2018266a 100644 --- a/packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp +++ b/packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 3 #define TEST_CATEGORY_DEATH hpx_DeathTest #define TEST_EXECSPACE 
Kokkos::Experimental::HPX +#define TEST_CATEGORY_FIXTURE(name) hpx_##name #endif diff --git a/packages/kokkos/core/unit_test/category_files/TestOpenACC_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestOpenACC_Category.hpp index 0c4e4b7e11953be5be1eefce5d453e08c605ce55..6105eadf14fefb59640f2ee3dedd1221ec3b1347 100644 --- a/packages/kokkos/core/unit_test/category_files/TestOpenACC_Category.hpp +++ b/packages/kokkos/core/unit_test/category_files/TestOpenACC_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 8 #define TEST_CATEGORY_DEATH openacc_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenACC +#define TEST_CATEGORY_FIXTURE(name) openacc_##name #endif diff --git a/packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp index 235b34ffab78d214301d35cbf2452dff9da0e377..921cff7890202b657643d7bfa07dc0a499194bc4 100644 --- a/packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp +++ b/packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 4 #define TEST_CATEGORY_DEATH openmptarget_DeathTest #define TEST_EXECSPACE Kokkos::Experimental::OpenMPTarget +#define TEST_CATEGORY_FIXTURE(name) openmptarget_##name #endif diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp index 465b066c32c98c242c52bbd0a4d18c4c40430184..4fc6b2e415211b5dd526c23500e0d2aaaf8d4004 100644 --- a/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp +++ b/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp @@ -21,6 +21,6 @@ #define TEST_CATEGORY sycl_host_usm #define TEST_CATEGORY_DEATH sycl_host_usm_DeathTest -#define TEST_EXECSPACE Kokkos::Experimental::SYCLHostUSMSpace +#define TEST_EXECSPACE Kokkos::SYCLHostUSMSpace #endif diff 
--git a/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp index e377f73eec28ca35ff873930bd3c9555f1a539f4..2fef64d13528d1515727974fcb9079c90355594a 100644 --- a/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp +++ b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp @@ -21,6 +21,6 @@ #define TEST_CATEGORY sycl_shared_usm #define TEST_CATEGORY_DEATH sycl_shared_usm_DeathTest -#define TEST_EXECSPACE Kokkos::Experimental::SYCLSharedUSMSpace +#define TEST_EXECSPACE Kokkos::SYCLSharedUSMSpace #endif diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp index 8e1b18c9acd9b69509f1793f9563699ddcd43052..192a7bb6380972442ef2f620f1cf16f8fc79dd94 100644 --- a/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp +++ b/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp @@ -22,6 +22,7 @@ #define TEST_CATEGORY sycl #define TEST_CATEGORY_NUMBER 7 #define TEST_CATEGORY_DEATH sycl_DeathTest -#define TEST_EXECSPACE Kokkos::Experimental::SYCL +#define TEST_EXECSPACE Kokkos::SYCL +#define TEST_CATEGORY_FIXTURE(name) sycl_##name #endif diff --git a/packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp index 13b0b653f21ec3061ffe0084fe9f52b10f6c98f6..ae8ac608339cb261183b0eff87618c79b4837228 100644 --- a/packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp +++ b/packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp @@ -23,5 +23,6 @@ #define TEST_CATEGORY_NUMBER 1 #define TEST_CATEGORY_DEATH threads_DeathTest #define TEST_EXECSPACE Kokkos::Threads +#define TEST_CATEGORY_FIXTURE(name) threads_##name #endif diff --git 
a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash index 8fe8e2b5ecea429f750eb4ad8eff6588f4ae9691..8bc8ef21cd028d380e9f0c87669db9101ac27ff7 100755 --- a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! -z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(SNB HSW SKX AMDAVX ARMv80 ARMv81 BDW KNC KNL Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Graph.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..45a2ad7aac0e850055e55cc04ef14672a4516b97 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Graph.cpp @@ -0,0 +1,151 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <filesystem> +#include <fstream> +#include <regex> + +#include <TestCuda_Category.hpp> +#include <Kokkos_Core.hpp> +#include <Kokkos_Graph.hpp> + +#include <gtest/gtest.h> + +namespace { + +template <typename ViewType> +struct Increment { + ViewType data; + + KOKKOS_FUNCTION + void operator()(const int) const { ++data(); } +}; + +class TEST_CATEGORY_FIXTURE(GraphInterOp) : public ::testing::Test { + public: + using execution_space = Kokkos::Cuda; + using view_t = + Kokkos::View<int, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic>>; + using graph_t = Kokkos::Experimental::Graph<execution_space>; + + void SetUp() override { + data = view_t(Kokkos::view_alloc(exec, "witness")); + + graph = Kokkos::Experimental::create_graph(exec, [&](const auto& root) { + root.then_parallel_for(1, Increment<view_t>{data}); + }); + } + + protected: + execution_space exec{}; + view_t data; + std::optional<graph_t> graph; +}; + +// This test checks the promises of Kokkos::Graph against its +// underlying Cuda native objects. +TEST_F(TEST_CATEGORY_FIXTURE(GraphInterOp), promises_on_native_objects) { + // Before instantiation, the Cuda graph is valid, but the Cuda executable + // graph is still null. + cudaGraph_t cuda_graph = graph->native_graph(); + + ASSERT_NE(cuda_graph, nullptr); + ASSERT_EQ(graph->native_graph_exec(), nullptr); + + // After instantiation, both native objects are valid. + graph->instantiate(); + + cudaGraphExec_t cuda_graph_exec = graph->native_graph_exec(); + + ASSERT_EQ(graph->native_graph(), cuda_graph); + ASSERT_NE(cuda_graph_exec, nullptr); + + // Submission should not affect the underlying objects. + graph->submit(); + + ASSERT_EQ(graph->native_graph(), cuda_graph); + ASSERT_EQ(graph->native_graph_exec(), cuda_graph_exec); +} + +// Count the number of nodes. This is useful to ensure no spurious +// (possibly empty) node is added. 
+TEST_F(TEST_CATEGORY_FIXTURE(GraphInterOp), count_nodes) { + graph->instantiate(); + + size_t num_nodes; + + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphGetNodes(graph->native_graph(), nullptr, &num_nodes)); + + ASSERT_EQ(num_nodes, 2u); +} + +// Use native Cuda graph to generate a DOT representation. +TEST_F(TEST_CATEGORY_FIXTURE(GraphInterOp), debug_dot_print) { +#if CUDA_VERSION < 11600 + GTEST_SKIP() << "Export a graph to DOT requires Cuda 11.6."; +#elif defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE < 9 + GTEST_SKIP() + << "The GNU C++ Library (libstdc++) versions less than 9.1 " + "require linking with `-lstdc++fs` when using std::filesystem"; +#elif defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 110000 + GTEST_SKIP() + << "The LLVM C++ Standard Library (libc++) versions less than " + "11 require linking with `-lc++fs` when using std::filesystem"; +#else + graph->instantiate(); + + const auto dot = std::filesystem::temp_directory_path() / "cuda_graph.dot"; + + // Convert path to string then to const char * to make it work on Windows. + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphDebugDotPrint(graph->native_graph(), dot.string().c_str(), + cudaGraphDebugDotFlagsVerbose)); + + ASSERT_TRUE(std::filesystem::exists(dot)); + ASSERT_GT(std::filesystem::file_size(dot), 0u); + + // We could write a check against the full kernel's function signature, but + // it would make the test rely too much on internal implementation details. + // Therefore, we just look for the functor and policy. Note that the + // signature is mangled in the 'dot' output. + const std::string expected("[A-Za-z0-9_]+Increment[A-Za-z0-9_]+RangePolicy"); + + std::stringstream buffer; + buffer << std::ifstream(dot).rdbuf(); + + ASSERT_TRUE(std::regex_search(buffer.str(), std::regex(expected))) + << "Could not find expected signature regex " << std::quoted(expected) + << " in " << dot; +#endif +} + +// Ensure that the graph has been instantiated with the default flag. 
+TEST_F(TEST_CATEGORY_FIXTURE(GraphInterOp), instantiation_flags) { +#if CUDA_VERSION < 12000 + GTEST_SKIP() << "Graph instantiation flag inspection requires Cuda 12."; +#else + graph->instantiate(); + unsigned long long flags = + Kokkos::Experimental::finite_max_v<unsigned long long>; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphExecGetFlags(graph->native_graph_exec(), &flags)); + + ASSERT_EQ(flags, 0u); +#endif +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b4b8792ac4b2723a448c62448701800e3359aa00 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,108 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <TestCuda_Category.hpp> +#include <TestMultiGPU.hpp> + +namespace { + +struct StreamsAndDevices { + std::array<cudaStream_t, 2> streams; + std::array<int, 2> devices; + + StreamsAndDevices() { + int n_devices; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices)); + + devices = {0, n_devices - 1}; + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&streams[i])); + } + } + StreamsAndDevices(const StreamsAndDevices &) = delete; + StreamsAndDevices &operator=(const StreamsAndDevices &) = delete; + ~StreamsAndDevices() { + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(streams[i])); + } + } +}; + +std::array<TEST_EXECSPACE, 2> get_execution_spaces( + const StreamsAndDevices &streams_and_devices) { + TEST_EXECSPACE exec0(streams_and_devices.streams[0]); + TEST_EXECSPACE exec1(streams_and_devices.streams[1]); + + // Must return void to use ASSERT_EQ + [&]() { + ASSERT_EQ(exec0.cuda_device(), streams_and_devices.devices[0]); + ASSERT_EQ(exec1.cuda_device(), streams_and_devices.devices[1]); + }(); + + return {exec0, exec1}; +} + +TEST(cuda_multi_gpu, managed_views) { + StreamsAndDevices streams_and_devices; + { + std::array<TEST_EXECSPACE, 2> execs = + get_execution_spaces(streams_and_devices); + + Kokkos::View<int *, TEST_EXECSPACE> view0( + Kokkos::view_alloc("v0", execs[0]), 100); + Kokkos::View<int *, TEST_EXECSPACE> view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); + } +} + +TEST(cuda_multi_gpu, unmanaged_views) { + StreamsAndDevices streams_and_devices; + { + std::array<TEST_EXECSPACE, 2> execs = + get_execution_spaces(streams_and_devices); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[0].cuda_device())); + int *p0; + KOKKOS_IMPL_CUDA_SAFE_CALL( + 
cudaMalloc(reinterpret_cast<void **>(&p0), sizeof(int) * 100)); + Kokkos::View<int *, TEST_EXECSPACE> view0(p0, 100); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[1].cuda_device())); + int *p; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast<void **>(&p), sizeof(int) * 100)); + Kokkos::View<int *, TEST_EXECSPACE> view(p, 100); + + test_policies(execs[0], view0, execs[1], view); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p)); + } +} + +TEST(cuda_multi_gpu, scratch_space) { + StreamsAndDevices streams_and_devices; + { + std::array<TEST_EXECSPACE, 2> execs = + get_execution_spaces(streams_and_devices); + + test_scratch(execs[0], execs[1]); + } +} +} // namespace diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp index ae603101abb32ce2b701886f11307ff1b10ac210..f40af99e7c2827562332d87e8df04074b0307e55 100644 --- a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -29,200 +29,182 @@ __global__ void test_cuda_spaces_int_value(int *ptr) { TEST(cuda, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HostSpace>::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::CudaHostPinnedSpace>::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::CudaSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::CudaSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::CudaSpace>::accessible, - ""); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::CudaSpace>::accessible); +#else + 
static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::CudaSpace>::accessible); +#endif static_assert( !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::CudaUVMSpace>::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::CudaUVMSpace>::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, - Kokkos::CudaSpace>::assignable, - ""); + Kokkos::CudaSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, - Kokkos::CudaUVMSpace>::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert( - !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, - Kokkos::CudaHostPinnedSpace>::assignable, - ""); + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, - Kokkos::CudaHostPinnedSpace>::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, - Kokkos::HostSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::HostSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, - Kokkos::HostSpace>::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::HostSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, - Kokkos::CudaUVMSpace>::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, - Kokkos::CudaSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::CudaSpace>::assignable); 
static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, - Kokkos::CudaSpace>::accessible, - ""); + Kokkos::CudaSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, - Kokkos::HostSpace>::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, - Kokkos::HostSpace>::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, - Kokkos::CudaHostPinnedSpace>::assignable, - ""); + Kokkos::HostSpace>::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, - Kokkos::CudaHostPinnedSpace>::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, - Kokkos::CudaHostPinnedSpace>::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, - Kokkos::HostSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::HostSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, - Kokkos::CudaSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::CudaSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, - Kokkos::CudaSpace>::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::CudaSpace>::accessible); 
static_assert( !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, - Kokkos::CudaUVMSpace>::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, - Kokkos::CudaUVMSpace>::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::HostSpace>::accessible, - ""); + !Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::CudaSpace>::accessible, - ""); + Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::CudaSpace>::accessible); static_assert(Kokkos::SpaceAccessibility<Kokkos::Cuda, - Kokkos::CudaUVMSpace>::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility<Kokkos::Cuda, - Kokkos::CudaHostPinnedSpace>::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY static_assert(!Kokkos::SpaceAccessibility<Kokkos::HostSpace, - Kokkos::CudaSpace>::accessible, - ""); + Kokkos::CudaSpace>::accessible); +#else + static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::CudaSpace>::accessible); +#endif static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, - Kokkos::CudaUVMSpace>::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility<Kokkos::HostSpace, - Kokkos::CudaHostPinnedSpace>::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); +#else + static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space, + Kokkos::Device<Kokkos::HostSpace::execution_space, + Kokkos::CudaSpace>>::value); +#endif static_assert( 
std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space, Kokkos::Device<Kokkos::HostSpace::execution_space, - Kokkos::CudaUVMSpace>>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaHostPinnedSpace>::Space, - Kokkos::CudaHostPinnedSpace>::value, - ""); + Kokkos::CudaHostPinnedSpace>::value); static_assert(std::is_same<Kokkos::Device<Kokkos::HostSpace::execution_space, Kokkos::CudaUVMSpace>, Kokkos::Device<Kokkos::HostSpace::execution_space, - Kokkos::CudaUVMSpace>>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( Kokkos::SpaceAccessibility<Kokkos::Impl::HostMirror<Kokkos::Cuda>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space, + Kokkos::HostSpace>::accessible); - static_assert( - Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror<Kokkos::CudaHostPinnedSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + static_assert(Kokkos::SpaceAccessibility< + Kokkos::Impl::HostMirror<Kokkos::CudaHostPinnedSpace>::Space, + Kokkos::HostSpace>::accessible); #ifdef KOKKOS_ENABLE_CUDA_UVM using uvm_view = Kokkos::View<double *, Kokkos::CudaUVMSpace>; static_assert(std::is_same<uvm_view::HostMirror::execution_space, diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp index 4a70d9e79beb8b96fedb97471e66d9512027323f..b88f052ea2e143da48c5be7947d64b4d09c7361c 100644 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp +++ 
b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp @@ -32,16 +32,13 @@ TEST(TEST_CATEGORY, host_space_access) { Kokkos::Impl::HostMirror<Kokkos::DefaultExecutionSpace>::Space; static_assert(Kokkos::SpaceAccessibility<host_exec_space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility<device_space, Kokkos::HostSpace>::accessible, - ""); + Kokkos::SpaceAccessibility<device_space, Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility<mirror_space, Kokkos::HostSpace>::accessible, - ""); + Kokkos::SpaceAccessibility<mirror_space, Kokkos::HostSpace>::accessible); } } // namespace Test diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp deleted file mode 100644 index 348b9feeab04b1bf12637c0fa8e0995550c1efca..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp deleted file mode 100644 index a77a55ea65303e8a34f8bb17d4e8923e749382fb..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp deleted file mode 100644 index 1b6a140920c85865cf458f408a2c42ee2496dbff..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp deleted file mode 100644 index 316bc85526f4cd947d55b1dd594b95ee8756d036..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp deleted file mode 100644 index 6344960a1cfe9229557f694c24c2a1f7fe5d531d..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp deleted file mode 100644 index 4515174b82b136b65cca1bfce7f7b0bb90a9a6ef..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp deleted file mode 100644 index 7ead50f0944e0e5a78fc1172ed4dd746974c34d2..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp deleted file mode 100644 index e12b9b3894aeb29da67ff78535c4207a93b83b98..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp deleted file mode 100644 index 959d0ab7503e15465fd06e0cf0beab510eecb58e..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp deleted file mode 100644 index 07d841519dcf33e9d84e0d8bbcabfcb495056db4..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp deleted file mode 100644 index 042a515b16acbc5c6656be06a8b92f6b669720e4..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp deleted file mode 100644 index dba401e5bcf9fc2be70195f1bd19c40cfb4c04b4..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp deleted file mode 100644 index a44c58bdb55adb261e39af050df443493bc0fb23..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp deleted file mode 100644 index cac0841dd8324de21722df8f3861acd1cb2a6303..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp deleted file mode 100644 index bafe3b3fd2af39bd626ec29fbdfdde40cc776bc3..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp deleted file mode 100644 index 3a4dd9d2533daa2019795085c0cc886927840051..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp deleted file mode 100644 index 4e92aae565a3be70762c68c7e41c6b0dc9251fda..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp deleted file mode 100644 index 44b8f3428d920fb99f08b56d556040615e34c2a7..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp index d81c71499fa14fe79e613171d5398514287f2364..df6912451c5eb73218744de1c17ce7ac3d709bae 100644 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeViewAPI.cpp @@ -39,7 +39,7 @@ struct TestViewAPI< using view_type = Kokkos::View<data_type, layout_type, space_type, traits_type>; using alloc_layout_type = - std::conditional_t<std::is_same<layout_type, Kokkos::LayoutStride>::value, + std::conditional_t<std::is_same_v<layout_type, Kokkos::LayoutStride>, Kokkos::LayoutLeft, layout_type>; using d_alloc_type = Kokkos::View<data_type, alloc_layout_type, space_type>; using h_alloc_type = typename Kokkos::View<data_type, alloc_layout_type, @@ -66,44 +66,44 @@ using compatible_extents_test_types = ::testing::Types< std::tuple<int[5], LayoutLeft, _sizes<>, _sizes<5>>, std::tuple<int*, LayoutLeft, _sizes<5>, _sizes<5>>, std::tuple<int[5][10], LayoutLeft, _sizes<>, _sizes<5, 10>>, - std::tuple<int * [10], LayoutLeft, _sizes<5>, _sizes<5, 10>>, + std::tuple<int* [10], LayoutLeft, _sizes<5>, _sizes<5, 10>>, std::tuple<int**, LayoutLeft, _sizes<5, 10>, _sizes<5, 10>>, std::tuple<int[5][10][15], LayoutLeft, _sizes<>, _sizes<5, 10, 15>>, - std::tuple<int * [10][15], LayoutLeft, _sizes<5>, _sizes<5, 10, 15>>, - std::tuple<int* * [15], LayoutLeft, _sizes<5, 10>, _sizes<5, 10, 15>>, + std::tuple<int* [10][15], LayoutLeft, _sizes<5>, _sizes<5, 10, 15>>, + std::tuple<int** [15], LayoutLeft, 
_sizes<5, 10>, _sizes<5, 10, 15>>, std::tuple<int***, LayoutLeft, _sizes<5, 10, 15>, _sizes<5, 10, 15>>, // LayoutRight std::tuple<int, LayoutRight, _sizes<>, _sizes<>>, std::tuple<int[5], LayoutRight, _sizes<>, _sizes<5>>, std::tuple<int*, LayoutRight, _sizes<5>, _sizes<5>>, std::tuple<int[5][10], LayoutRight, _sizes<>, _sizes<5, 10>>, - std::tuple<int * [10], LayoutRight, _sizes<5>, _sizes<5, 10>>, + std::tuple<int* [10], LayoutRight, _sizes<5>, _sizes<5, 10>>, std::tuple<int**, LayoutRight, _sizes<5, 10>, _sizes<5, 10>>, std::tuple<int[5][10][15], LayoutRight, _sizes<>, _sizes<5, 10, 15>>, - std::tuple<int * [10][15], LayoutRight, _sizes<5>, _sizes<5, 10, 15>>, - std::tuple<int* * [15], LayoutRight, _sizes<5, 10>, _sizes<5, 10, 15>>, + std::tuple<int* [10][15], LayoutRight, _sizes<5>, _sizes<5, 10, 15>>, + std::tuple<int** [15], LayoutRight, _sizes<5, 10>, _sizes<5, 10, 15>>, std::tuple<int***, LayoutRight, _sizes<5, 10, 15>, _sizes<5, 10, 15>>, // LayoutStride std::tuple<int, LayoutStride, _sizes<>, _sizes<>>, std::tuple<int[5], LayoutStride, _sizes<>, _sizes<5>>, std::tuple<int*, LayoutStride, _sizes<5>, _sizes<5>>, std::tuple<int[5][10], LayoutStride, _sizes<>, _sizes<5, 10>>, - std::tuple<int * [10], LayoutStride, _sizes<5>, _sizes<5, 10>>, + std::tuple<int* [10], LayoutStride, _sizes<5>, _sizes<5, 10>>, std::tuple<int**, LayoutStride, _sizes<5, 10>, _sizes<5, 10>>, std::tuple<int[5][10][15], LayoutStride, _sizes<>, _sizes<5, 10, 15>>, - std::tuple<int * [10][15], LayoutStride, _sizes<5>, _sizes<5, 10, 15>>, - std::tuple<int* * [15], LayoutStride, _sizes<5, 10>, _sizes<5, 10, 15>>, + std::tuple<int* [10][15], LayoutStride, _sizes<5>, _sizes<5, 10, 15>>, + std::tuple<int** [15], LayoutStride, _sizes<5, 10>, _sizes<5, 10, 15>>, std::tuple<int***, LayoutStride, _sizes<5, 10, 15>, _sizes<5, 10, 15>>, // Degenerated Sizes std::tuple<int*, LayoutLeft, _sizes<0>, _sizes<0>>, - std::tuple<int * [10], LayoutLeft, _sizes<0>, _sizes<0, 10>>, - std::tuple<int* * [15], 
LayoutLeft, _sizes<0, 0>, _sizes<0, 0, 15>>, + std::tuple<int* [10], LayoutLeft, _sizes<0>, _sizes<0, 10>>, + std::tuple<int** [15], LayoutLeft, _sizes<0, 0>, _sizes<0, 0, 15>>, std::tuple<int*, LayoutRight, _sizes<0>, _sizes<0>>, - std::tuple<int * [10], LayoutRight, _sizes<0>, _sizes<0, 10>>, - std::tuple<int* * [15], LayoutRight, _sizes<0, 0>, _sizes<0, 0, 15>>, + std::tuple<int* [10], LayoutRight, _sizes<0>, _sizes<0, 10>>, + std::tuple<int** [15], LayoutRight, _sizes<0, 0>, _sizes<0, 0, 15>>, std::tuple<int*, LayoutStride, _sizes<0>, _sizes<0>>, - std::tuple<int * [10], LayoutStride, _sizes<0>, _sizes<0, 10>>, - std::tuple<int* * [15], LayoutStride, _sizes<0, 0>, _sizes<0, 0, 15>>>; + std::tuple<int* [10], LayoutStride, _sizes<0>, _sizes<0, 10>>, + std::tuple<int** [15], LayoutStride, _sizes<0, 0>, _sizes<0, 0, 15>>>; TYPED_TEST_SUITE(TestViewAPI, compatible_extents_test_types, ); diff --git a/packages/kokkos/core/unit_test/diffconfig.sh b/packages/kokkos/core/unit_test/diffconfig.sh deleted file mode 100755 index 0c8836ff83ca93d5293a986fb68f3a05b2291f51..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/diffconfig.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# CMake and Make tests run in separate directories -# The mapping of ARCH to #define is very complicated -# so diff is used instead of grepping -if test "`basename $PWD`" = "cmaketest"; then - outfile=$1 - resfile=../results/$1 -else - outfile=config/tmpstore/$1 - resfile=config/results/$1 -fi - -diff=`diff $outfile $resfile 2>&1 | grep -e define -e "such file"` -if test -z "$diff"; then - echo Passed -else - echo Failed: $diff -fi diff --git a/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt b/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt index f792b03ed8807856627eddd1ba02ce3ef75aadfb..5c72b2b828f2d1bbe7ee5f0b9d985d89f00a107f 100644 --- a/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt +++ 
b/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt @@ -2,19 +2,18 @@ # but we just try to compile them. # Globbing all the header filenames to test for self-containment and presence of header guards -SET(BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../") -file(GLOB KOKKOS_CORE_HEADERS RELATIVE ${BASE_DIR}/core/src - ${BASE_DIR}/core/src/*.hpp ${BASE_DIR}/core/src/*.h) -file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src - ${BASE_DIR}/containers/src/*.hpp) -file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE ${BASE_DIR}/algorithms/src - ${BASE_DIR}/algorithms/src/*.hpp) +set(BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../") +file(GLOB KOKKOS_CORE_HEADERS RELATIVE ${BASE_DIR}/core/src ${BASE_DIR}/core/src/*.hpp ${BASE_DIR}/core/src/*.h) +file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src ${BASE_DIR}/containers/src/*.hpp) +file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE ${BASE_DIR}/algorithms/src ${BASE_DIR}/algorithms/src/*.hpp) -if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) +# erroring out when deprecated code is disabled and raising warnings that are treated as errors in the CI otherwise +if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 OR Kokkos_ENABLE_DEPRECATION_WARNINGS) list(REMOVE_ITEM KOKKOS_CONTAINERS_HEADERS "Kokkos_Vector.hpp") + list(REMOVE_ITEM KOKKOS_CORE_HEADERS "Kokkos_Future.hpp" "Kokkos_TaskScheduler.hpp") endif() -foreach (_header ${KOKKOS_CORE_HEADERS} ${KOKKOS_CONTAINERS_HEADERS} ${KOKKOS_ALGORITHMS_HEADERS}) +foreach(_header ${KOKKOS_CORE_HEADERS} ${KOKKOS_CONTAINERS_HEADERS} ${KOKKOS_ALGORITHMS_HEADERS}) string(REGEX REPLACE "[\./]" "_" header_test_name ${_header}) set(header_test_name Kokkos_HeaderSelfContained_${header_test_name}) set_source_files_properties(tstHeader.cpp PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Graph.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Graph.cpp deleted file mode 100644 index 
405cb76c643cc90e3e0228d41d8439e36aa7a500..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/hip/TestHIP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include <TestHIP_Category.hpp> -#include <TestGraph.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Graph.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..90633c879483e48dcc8ffe541fa0489614f4ed18 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Graph.cpp @@ -0,0 +1,127 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <filesystem> +#include <fstream> +#include <regex> + +#include <TestHIP_Category.hpp> +#include <Kokkos_Core.hpp> +#include <Kokkos_Graph.hpp> + +#include <gtest/gtest.h> + +namespace { + +template <typename ViewType> +struct Increment { + ViewType data; + + KOKKOS_FUNCTION + void operator()(const int) const { ++data(); } +}; + +// This test checks the promises of Kokkos::Graph against its +// underlying HIP native objects. +TEST(TEST_CATEGORY, graph_promises_on_native_objects) { +#if !defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + GTEST_SKIP() << "This test will not work without native graph support"; +#else + auto graph = Kokkos::Experimental::create_graph<Kokkos::HIP>(); + + auto root = Kokkos::Impl::GraphAccess::create_root_ref(graph); + + // Before instantiation, the HIP graph is valid, but the HIP executable + // graph is still null. + hipGraph_t hip_graph = graph.native_graph(); + + ASSERT_NE(hip_graph, nullptr); + ASSERT_EQ(graph.native_graph_exec(), nullptr); + + // After instantiation, both native objects are valid. + graph.instantiate(); + + hipGraphExec_t hip_graph_exec = graph.native_graph_exec(); + + ASSERT_EQ(graph.native_graph(), hip_graph); + ASSERT_NE(hip_graph_exec, nullptr); + + // Submission should not affect the underlying objects. + graph.submit(); + + ASSERT_EQ(graph.native_graph(), hip_graph); + ASSERT_EQ(graph.native_graph_exec(), hip_graph_exec); +#endif +} + +// Use native HIP graph to generate a DOT representation. 
+TEST(TEST_CATEGORY, graph_instantiate_and_debug_dot_print) { +#if !defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + GTEST_SKIP() << "This test will not work without native graph support"; +#elif defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE < 9 + GTEST_SKIP() + << "The GNU C++ Library (libstdc++) versions less than 9.1 " + "require linking with `-lstdc++fs` when using std::filesystem"; +#elif defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 110000 + GTEST_SKIP() + << "The LLVM C++ Standard Library (libc++) versions less than " + "11 require linking with `-lc++fs` when using std::filesystem"; +#else + using view_t = Kokkos::View<int, Kokkos::HIP>; + + const Kokkos::HIP exec{}; + + view_t data(Kokkos::view_alloc(exec, "witness")); + + auto graph = Kokkos::Experimental::create_graph(exec); + + auto root = Kokkos::Impl::GraphAccess::create_root_ref(graph); + + root.then_parallel_for(1, Increment<view_t>{data}); + + graph.instantiate(); + + size_t num_nodes; + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipGraphGetNodes(graph.native_graph(), nullptr, &num_nodes)); + + ASSERT_EQ(num_nodes, 2u); + + const auto dot = std::filesystem::temp_directory_path() / "hip_graph.dot"; + + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDebugDotPrint( + graph.native_graph(), dot.c_str(), hipGraphDebugDotFlagsVerbose)); + + ASSERT_TRUE(std::filesystem::exists(dot)); + ASSERT_GT(std::filesystem::file_size(dot), 0u); + + // We could write a check against the full kernel's function signature, but + // it would make the test rely too much on internal implementation details. + // Therefore, we just look for the functor and policy. Note that the + // signature is mangled in the 'dot' output. 
+ const std::string expected("[A-Za-z0-9_]+Increment[A-Za-z0-9_]+RangePolicy"); + + std::stringstream buffer; + buffer << std::ifstream(dot).rdbuf(); + + ASSERT_TRUE(std::regex_search(buffer.str(), std::regex(expected))) + << "Could not find expected signature regex " << std::quoted(expected) + << " in " << dot; +#endif +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp index 14fd4e28837cd6ed690bf006ada187ebe74447cb..9e1347da62cf27d52b9e37d56cfe9b673560c47b 100644 --- a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp +++ b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp @@ -29,198 +29,180 @@ __global__ void test_hip_spaces_int_value(int *ptr) { TEST(hip, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HostSpace>::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HIPHostPinnedSpace>::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HIPSpace>::assignable, - ""); + Kokkos::HIPSpace>::assignable); +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HIPSpace>::accessible, - ""); + Kokkos::HIPSpace>::accessible); +#else + static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::HIPSpace>::accessible); +#endif static_assert( !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HIPManagedSpace>::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HIPManagedSpace>::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, - Kokkos::HIPSpace>::assignable, - ""); + 
Kokkos::HIPSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, - Kokkos::HIPHostPinnedSpace>::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, - Kokkos::HIPHostPinnedSpace>::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, - Kokkos::HostSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, + Kokkos::HostSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, - Kokkos::HostSpace>::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, + Kokkos::HostSpace>::accessible); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, - Kokkos::HIPManagedSpace>::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPSpace, - Kokkos::HIPManagedSpace>::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPHostPinnedSpace, - Kokkos::HIPHostPinnedSpace>::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPHostPinnedSpace, - Kokkos::HostSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPHostPinnedSpace, + Kokkos::HostSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPHostPinnedSpace, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPHostPinnedSpace, - Kokkos::HIPSpace>::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPHostPinnedSpace, - Kokkos::HIPSpace>::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( 
!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPHostPinnedSpace, - Kokkos::HIPManagedSpace>::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPHostPinnedSpace, - Kokkos::HIPManagedSpace>::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, - Kokkos::HIPManagedSpace>::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, - Kokkos::HostSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, + Kokkos::HostSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, - Kokkos::HostSpace>::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, - Kokkos::HIPSpace>::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, - Kokkos::HIPSpace>::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, - Kokkos::HIPHostPinnedSpace>::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess<Kokkos::HIPManagedSpace, - Kokkos::HIPHostPinnedSpace>::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility<Kokkos::HIP, Kokkos::HostSpace>::accessible, - ""); + !Kokkos::SpaceAccessibility<Kokkos::HIP, Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility<Kokkos::HIP, Kokkos::HIPSpace>::accessible, - ""); + Kokkos::SpaceAccessibility<Kokkos::HIP, Kokkos::HIPSpace>::accessible); static_assert( 
Kokkos::SpaceAccessibility<Kokkos::HIP, - Kokkos::HIPHostPinnedSpace>::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility<Kokkos::HIP, - Kokkos::HIPManagedSpace>::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility<Kokkos::HIP, + Kokkos::HIPManagedSpace>::accessible); +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) static_assert(!Kokkos::SpaceAccessibility<Kokkos::HostSpace, - Kokkos::HIPSpace>::accessible, - ""); + Kokkos::HIPSpace>::accessible); +#else + static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::HIPSpace>::accessible); +#endif static_assert( Kokkos::SpaceAccessibility<Kokkos::HostSpace, - Kokkos::HIPHostPinnedSpace>::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, - Kokkos::HIPManagedSpace>::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::HIPManagedSpace>::accessible); +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::HIPSpace>::Space, + Kokkos::HostSpace>::value); +#else static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::HIPSpace>::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::Device<Kokkos::HostSpace::execution_space, + Kokkos::HIPSpace>>::value); +#endif static_assert( std::is_same<Kokkos::Impl::HostMirror<Kokkos::HIPHostPinnedSpace>::Space, - Kokkos::HIPHostPinnedSpace>::value, - ""); + Kokkos::HIPHostPinnedSpace>::value); static_assert( std::is_same<Kokkos::Impl::HostMirror<Kokkos::HIPManagedSpace>::Space, Kokkos::Device<Kokkos::HostSpace::execution_space, - Kokkos::HIPManagedSpace>>::value, - ""); + Kokkos::HIPManagedSpace>>::value); static_assert( Kokkos::SpaceAccessibility<Kokkos::Impl::HostMirror<Kokkos::HIP>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - 
Kokkos::Impl::HostMirror<Kokkos::HIPSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror<Kokkos::HIPSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror<Kokkos::HIPHostPinnedSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror<Kokkos::HIPHostPinnedSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror<Kokkos::HIPManagedSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror<Kokkos::HIPManagedSpace>::Space, + Kokkos::HostSpace>::accessible); } template <class MemSpace, class ExecSpace> diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_UnifiedMemory_ZeroMemset.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_UnifiedMemory_ZeroMemset.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b67e2ba361c409a71b27446853b274f7c59a2f96 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIP_UnifiedMemory_ZeroMemset.cpp @@ -0,0 +1,44 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <TestHIP_Category.hpp> +#include <Kokkos_Core.hpp> + +namespace Test { + +// On MI300a with ROCM <= 6.2.0, hipMemsetAsync was failing with an error when +// called on host-allocated buffers. 
The fix was in PR 7380 to use a +// parallel_for to zero memory +TEST(hip, unified_memory_zero_memset) { +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + GTEST_SKIP() + << "this test should only be run with HIP unified memory enabled"; +#endif + + constexpr size_t N = 1024 * 1024; // size doesn't matter + std::vector<int> v(N, 1); // initialize to non-zero + Kokkos::View<int*, Kokkos::HIPSpace> a(v.data(), N); + + // zero with deep_copy (this is where the error occurs) + Kokkos::deep_copy(a, 0); + + // see if it was zeroed + int err; + Kokkos::parallel_reduce( + N, KOKKOS_LAMBDA(int i, int& lerr) { lerr += (a[i] != 0); }, err); + EXPECT_EQ(err, 0); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp index 25c7138ed3c14ae594e56ec40952f8aee5be018c..e988ed3758db06247a7c8c55af2dc6cd29478c96 100644 --- a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp @@ -62,8 +62,16 @@ struct TestIncrExecSpace { auto concurrency = ExecSpace().concurrency(); ASSERT_GT(concurrency, 0); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif int in_parallel = ExecSpace::in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif ASSERT_FALSE(in_parallel); +#endif const char* name = ExecSpace::name(); std::cout << name << std::endl; diff --git a/packages/kokkos/core/unit_test/incremental/Test04_ParallelFor_RangePolicy.hpp b/packages/kokkos/core/unit_test/incremental/Test04_ParallelFor_RangePolicy.hpp index 75cf601e782b031481ee42a51a631065fdf2b33f..632df40fc1ec674c0ecad6d8bea470c46dc4e106 100644 --- a/packages/kokkos/core/unit_test/incremental/Test04_ParallelFor_RangePolicy.hpp +++ 
b/packages/kokkos/core/unit_test/incremental/Test04_ParallelFor_RangePolicy.hpp @@ -131,7 +131,7 @@ struct TestParallel_For { }; TEST(TEST_CATEGORY, IncrTest_04_simple_parallelFor) { - if (std::is_same<Kokkos::DefaultExecutionSpace, TEST_EXECSPACE>::value) { + if (std::is_same_v<Kokkos::DefaultExecutionSpace, TEST_EXECSPACE>) { TestParallel_For<TEST_EXECSPACE> test; test.simple_test(); } diff --git a/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp b/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp index ed22c22d709f18a38480f551352de1f538cef5f9..b120fe5aa5503ea6285383f535ed78c2aafeda07 100644 --- a/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp @@ -43,10 +43,10 @@ struct NonTrivialReduceFunctor { UpdateSum += (i + 1) * value; } - NonTrivialReduceFunctor() = default; - NonTrivialReduceFunctor(NonTrivialReduceFunctor const &) = default; - NonTrivialReduceFunctor(NonTrivialReduceFunctor &&) = default; - NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor &&) = default; + NonTrivialReduceFunctor() = default; + NonTrivialReduceFunctor(NonTrivialReduceFunctor const &) = default; + NonTrivialReduceFunctor(NonTrivialReduceFunctor &&) = default; + NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor &&) = default; NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor const &) = default; // Also make sure that it's OK if the destructor is not device-callable. 
~NonTrivialReduceFunctor() {} diff --git a/packages/kokkos/core/unit_test/incremental/Test10_HierarchicalBasics.hpp b/packages/kokkos/core/unit_test/incremental/Test10_HierarchicalBasics.hpp index 1f3370b6ebd422609ae35841e404e13a693562e5..bca0998e2a350914e28cd6b6a843dc0b58a132df 100644 --- a/packages/kokkos/core/unit_test/incremental/Test10_HierarchicalBasics.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test10_HierarchicalBasics.hpp @@ -56,8 +56,8 @@ struct HierarchicalBasics { Kokkos::fence(); auto h_v = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); - size_t check = 0; - size_t ref = nP * nT; + int check = 0; + int ref = nP * nT; for (int i = 0; i < nP; ++i) for (int j = 0; j < nT; ++j) check += h_v(i, j); diff --git a/packages/kokkos/core/unit_test/incremental/Test11a_ParallelFor_TeamThreadRange.hpp b/packages/kokkos/core/unit_test/incremental/Test11a_ParallelFor_TeamThreadRange.hpp index ec15e245e7ad596db7433e87e06517a68eb770a3..54291a76a58933d66b793146ab0ef86168f32727 100644 --- a/packages/kokkos/core/unit_test/incremental/Test11a_ParallelFor_TeamThreadRange.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test11a_ParallelFor_TeamThreadRange.hpp @@ -54,7 +54,7 @@ struct Hierarchical_ForLoop_A { auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); long long int check = 0; - const long long int s = sY * sX; + const long long int s = static_cast<long long int>(sY) * sX; for (int i = 0; i < sX; ++i) for (int j = 0; j < sY; ++j) check += v_H(i, j); ASSERT_EQ(check, s * (s - 1) / 2); diff --git a/packages/kokkos/core/unit_test/incremental/Test11b_ParallelFor_TeamVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test11b_ParallelFor_TeamVectorRange.hpp index 07b3dd2da1b32ebd4b0e3090afc6b87d209d8cd0..112a0982627fd7f308076048be0fa7c4151c6988 100644 --- a/packages/kokkos/core/unit_test/incremental/Test11b_ParallelFor_TeamVectorRange.hpp +++ 
b/packages/kokkos/core/unit_test/incremental/Test11b_ParallelFor_TeamVectorRange.hpp @@ -54,7 +54,7 @@ struct Hierarchical_ForLoop_B { auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); long long int check = 0; - const long long int s = sY * sX; + const long long int s = static_cast<long long int>(sY) * sX; for (int i = 0; i < sX; ++i) for (int j = 0; j < sY; ++j) check += v_H(i, j); ASSERT_EQ(check, s * (s - 1) / 2); diff --git a/packages/kokkos/core/unit_test/incremental/Test11c_ParallelFor_ThreadVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test11c_ParallelFor_ThreadVectorRange.hpp index caa7087df41b51c3f6c18c218965f2862425caca..c454c430ac4d8a6e0d99d883dd81781211b395d8 100644 --- a/packages/kokkos/core/unit_test/incremental/Test11c_ParallelFor_ThreadVectorRange.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test11c_ParallelFor_ThreadVectorRange.hpp @@ -59,7 +59,7 @@ struct Hierarchical_ForLoop_C { auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); size_t check = 0; - const size_t s = sX * sY * sZ; + const size_t s = static_cast<size_t>(sX) * sY * sZ; for (int i = 0; i < sX; ++i) for (int j = 0; j < sY; ++j) for (int k = 0; k < sZ; ++k) check += v_H(i, j, k); diff --git a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp index 8c97043f30087e78f18783a05e0ac6cb591e3bf3..4a3cc78cd924665aeb836303b9697f71d8e605e3 100644 --- a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp @@ -75,7 +75,8 @@ struct ThreadScratch { .set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize)); int max_team_size = policy.team_size_max(*this, Kokkos::ParallelForTag()); - v = data_t("Matrix", pN, max_team_size); + ASSERT_GT(max_team_size, 0); + v = data_t("Matrix", pN, max_team_size); Kokkos::parallel_for( "Test12a_ThreadScratch", @@ -87,7 
+88,7 @@ struct ThreadScratch { auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); size_t check = 0; - const size_t s = pN * sX * sY; + const size_t s = static_cast<size_t>(pN) * sX * sY; for (int n = 0; n < pN; ++n) for (int m = 0; m < max_team_size; ++m) { check += v_H(n, m); @@ -96,12 +97,14 @@ struct ThreadScratch { } }; +KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() TEST(TEST_CATEGORY, IncrTest_12a_ThreadScratch) { - ThreadScratch<TEST_EXECSPACE> test; #ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC GTEST_SKIP() << "skipping since scratch memory is not yet implemented in the " "OpenACC backend"; #endif + + ThreadScratch<TEST_EXECSPACE> test; // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to // pass in the Release and RelWithDebInfo builds. Does not need the team_size // to be a multiple of 32 for the Debug builds. @@ -115,5 +118,6 @@ TEST(TEST_CATEGORY, IncrTest_12a_ThreadScratch) { test.run(14, 277, 321); #endif } +KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() } // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp index 0ebb5c50fbce40c0a92b96050d3952e4640d9f98..739b8f58025d72118c8a730313643504d5b9acfb 100644 --- a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp @@ -79,19 +79,21 @@ struct TeamScratch { auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); size_t check = 0; - const size_t s = pN * sX * sY; + const size_t s = static_cast<size_t>(pN) * sX * sY; for (int n = 0; n < pN; ++n) for (int m = 0; m < sX; ++m) check += v_H(n, m); ASSERT_EQ(check, s * (s - 1) / 2); } }; +KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() TEST(TEST_CATEGORY, IncrTest_12b_TeamScratch) { - TeamScratch<TEST_EXECSPACE> test; #ifdef KOKKOS_ENABLE_OPENACC // FIXME_OPENACC GTEST_SKIP() << "skipping since scratch 
memory is not yet implemented in the " "OpenACC backend"; #endif + + TeamScratch<TEST_EXECSPACE> test; // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to // pass in the Release and RelWithDebInfo builds. Does not need the team_size // to be a multiple of 32 for the Debug builds. @@ -105,5 +107,6 @@ TEST(TEST_CATEGORY, IncrTest_12b_TeamScratch) { test.run(14, 277, 321); #endif } +KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() } // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp index 32a37013cf62805ade0ee88afdc244bf47d84d15..e61fcaec25c82806ece79aff46b0ddf0434136cc 100644 --- a/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp @@ -48,7 +48,7 @@ struct Hierarchical_Red_C { Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, sY), [=](const int k, int &tmp_inner) { - tmp_inner += n * sX * v.extent(0) + sX * i + k; + tmp_inner += n * sX * v.extent_int(0) + sX * i + k; }, out_inner); diff --git a/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp b/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp index efcb19a5c6b654c4bdca9a245674456d3a165768..232122e641fa4a7e0b6bae8f0dd8638fa7c49939 100644 --- a/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp @@ -53,9 +53,9 @@ struct NonTrivialScanFunctor { NonTrivialScanFunctor(const Kokkos::View<value_type *, ExecSpace> &data) : d_data(data) {} - NonTrivialScanFunctor(NonTrivialScanFunctor const &) = default; - NonTrivialScanFunctor(NonTrivialScanFunctor &&) = default; - NonTrivialScanFunctor &operator=(NonTrivialScanFunctor &&) = default; + NonTrivialScanFunctor(NonTrivialScanFunctor const &) = default; + 
NonTrivialScanFunctor(NonTrivialScanFunctor &&) = default; + NonTrivialScanFunctor &operator=(NonTrivialScanFunctor &&) = default; NonTrivialScanFunctor &operator=(NonTrivialScanFunctor const &) = default; // Also make sure that it's OK if the destructor is not device-callable. ~NonTrivialScanFunctor() {} diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp deleted file mode 100644 index 22c8ab1bf8fdc8ea888d95ac174776da39a70017..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include <TestOpenMP_Category.hpp> -#include <TestGraph.hpp> diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp deleted file mode 100644 index 92b8032bf0c44eed543e974e0d8e706d4a50d9f5..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp +++ /dev/null @@ -1,105 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include <TestOpenMP_Category.hpp> -#include <Kokkos_Core.hpp> - -#include <mutex> - -namespace Test { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -TEST(openmp, partition_master) { - using Mutex = Kokkos::Experimental::MasterLock<Kokkos::OpenMP>; - - Mutex mtx; - int errors = 0; - - auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) { - const int pool_size = Kokkos::OpenMP().impl_thread_pool_size(); - - { - std::unique_lock<Mutex> lock(mtx); - if (Kokkos::OpenMP::in_parallel()) { - ++errors; - } - if (Kokkos::OpenMP::impl_thread_pool_rank() != 0) { - ++errors; - } - } - - { - int local_errors = 0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy<Kokkos::OpenMP>(0, 1000), - [pool_size](const int, int& errs) { - if (Kokkos::OpenMP().impl_thread_pool_size() != pool_size) { - ++errs; - } - }, - local_errors); - Kokkos::atomic_add(&errors, local_errors); - } - - Kokkos::Experimental::UniqueToken<Kokkos::OpenMP> token; - - Kokkos::View<int*, Kokkos::OpenMP> count("", token.size()); - - Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0, 1000), - [=](const int) { - int i = token.acquire(); - ++count[i]; - token.release(i); - }); - - Kokkos::View<int, Kokkos::OpenMP> sum(""); - Kokkos::parallel_for( - Kokkos::RangePolicy<Kokkos::OpenMP>(0, token.size()), - [=](const int i) { Kokkos::atomic_add(sum.data(), count[i]); }); - - if (sum() != 1000) { - Kokkos::atomic_add(&errors, 1); - } - }; - - master(0, 1); - - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 4, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 4); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 2, 2); - ASSERT_EQ(errors, 0); - - 
Kokkos::OpenMP::partition_master(master, 8, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 8); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 8); - ASSERT_EQ(errors, 0); -} -#endif - -} // namespace Test diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp deleted file mode 100644 index bff64d83e276e874e6b9d7e33031fc1ce46f5619..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include <TestSerial_Category.hpp> -#include <TestGraph.hpp> diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Graph.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f28689348471c418449395f89602e7b617eb8a8 --- /dev/null +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Graph.cpp @@ -0,0 +1,114 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <filesystem> +#include <fstream> +#include <regex> + +#include <TestSYCL_Category.hpp> +#include <Kokkos_Core.hpp> +#include <Kokkos_Graph.hpp> + +#include <gtest/gtest.h> + +namespace { + +template <typename ViewType> +struct Increment { + ViewType data; + + KOKKOS_FUNCTION + void operator()(const int) const { ++data(); } +}; + +TEST(TEST_CATEGORY, graph_get_native_return_types_are_references) { + using graph_t = Kokkos::Experimental::Graph<Kokkos::SYCL>; + static_assert( + std::is_reference_v<decltype(std::declval<graph_t>().native_graph())>); + static_assert(std::is_reference_v< + decltype(std::declval<graph_t>().native_graph_exec())>); +} + +// This test checks the promises of Kokkos::Graph against its +// underlying SYCL native objects. +TEST(TEST_CATEGORY, graph_promises_on_native_objects) { + auto graph = Kokkos::Experimental::create_graph<Kokkos::SYCL>(); + + auto root = Kokkos::Impl::GraphAccess::create_root_ref(graph); + + // Before instantiation, the SYCL graph is valid, but the SYCL executable + // graph is still null. Since the SYCL command graph is a regular object, + // no check is needed. + // However, the executable SYCL command graph is stored as an optional, + // so let's check it is empty for now. + ASSERT_FALSE(graph.native_graph_exec().has_value()); + + // After instantiation, both native objects are valid. + graph.instantiate(); + + ASSERT_TRUE(graph.native_graph_exec().has_value()); +} + +// Use native SYCL graph to generate a DOT representation. 
+TEST(TEST_CATEGORY, graph_instantiate_and_debug_dot_print) { + using view_t = Kokkos::View<int, Kokkos::SYCL>; + + const Kokkos::SYCL exec{}; + + view_t data(Kokkos::view_alloc(exec, "witness")); + + auto graph = Kokkos::Experimental::create_graph(exec); + + auto root = Kokkos::Impl::GraphAccess::create_root_ref(graph); + + root.then_parallel_for(1, Increment<view_t>{data}); + + graph.instantiate(); + + ASSERT_EQ(graph.native_graph().get_nodes().size(), 2u); + +#if defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE < 9 + GTEST_SKIP() + << "The GNU C++ Library (libstdc++) versions less than 9.1 " + "require linking with `-lstdc++fs` when using std::filesystem"; +#elif defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 110000 + GTEST_SKIP() + << "The LLVM C++ Standard Library (libc++) versions less than " + "11 require linking with `-lc++fs` when using std::filesystem"; +#else + const auto dot = std::filesystem::temp_directory_path() / "sycl_graph.dot"; + + graph.native_graph().print_graph(dot, true); + + ASSERT_TRUE(std::filesystem::exists(dot)); + ASSERT_GT(std::filesystem::file_size(dot), 0u); + + // We could write a check against the full kernel's function signature, but + // it would make the test rely too much on internal implementation details. + // Therefore, we just look for the functor and policy. Note that the + // signature is mangled in the 'dot' output. 
+ const std::string expected("[A-Za-z0-9_]+Increment[A-Za-z0-9_]+RangePolicy"); + + std::stringstream buffer; + buffer << std::ifstream(dot).rdbuf(); + + ASSERT_TRUE(std::regex_search(buffer.str(), std::regex(expected))) + << "Could not find expected signature regex " << std::quoted(expected) + << " in " << dot; +#endif +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp index 8b6f08c14ad634f2a9dc269ba6e7d85e577dc915..82ec62f0e4a88922f9b57329ff0248d9bc09f780 100644 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp @@ -26,7 +26,7 @@ namespace Test { TEST(sycl, raw_sycl_interop) { // Make sure all queues use the same context Kokkos::initialize(); - Kokkos::Experimental::SYCL default_space; + Kokkos::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); sycl::queue queue(default_context, sycl::default_selector_v, diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp index 4811fb6d976544610ad10c13aa8cfc78da7d4b7c..abf8449705eff1e5f3702d4cf63da5056eba23b9 100644 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp @@ -24,7 +24,7 @@ namespace Test { // Test whether external allocations can be accessed by the default queue. 
TEST(sycl, raw_sycl_interop_context_1) { // Make sure all queues use the same context - Kokkos::Experimental::SYCL default_space; + Kokkos::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); sycl::queue queue(default_context, sycl::default_selector_v, @@ -32,7 +32,7 @@ TEST(sycl, raw_sycl_interop_context_1) { constexpr int n = 100; int* p = sycl::malloc_device<int>(n, queue); - Kokkos::Experimental::SYCL space(queue); + Kokkos::SYCL space(queue); Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, n); Kokkos::deep_copy(v, 5); @@ -58,16 +58,15 @@ TEST(sycl, raw_sycl_interop_context_1) { // Test whether regular View allocations can be accessed by non-default queues. TEST(sycl, raw_sycl_interop_context_2) { - Kokkos::Experimental::SYCL default_space; + Kokkos::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); sycl::queue queue(default_context, sycl::default_selector_v, sycl::property::queue::in_order()); constexpr int n = 100; - Kokkos::Experimental::SYCL space(queue); - Kokkos::View<int*, Kokkos::Experimental::SYCLDeviceUSMSpace> v("default_view", - n); + Kokkos::SYCL space(queue); + Kokkos::View<int*, Kokkos::SYCLDeviceUSMSpace> v("default_view", n); Kokkos::deep_copy(space, v, 5); auto* v_ptr = v.data(); diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp index c0070adb0cb2a400441dc2bd571b5f2142d2403e..5ac5d4e734d98d0884c28f54590e7f4aba666c8c 100644 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp @@ -22,7 +22,7 @@ namespace Test { TEST(sycl, raw_sycl_queues) { // Make sure all queues use the same context Kokkos::initialize(); - Kokkos::Experimental::SYCL default_space; + Kokkos::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); sycl::queue 
queue(default_context, sycl::default_selector_v, diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d3906e409f5ab7d56179da5a88c7bbd1b6dfa6a9 --- /dev/null +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,64 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <TestSYCL_Category.hpp> +#include <TestMultiGPU.hpp> + +namespace { + +std::array<TEST_EXECSPACE, 2> get_execution_spaces() { + std::vector<sycl::device> gpu_devices = + sycl::device::get_devices(sycl::info::device_type::gpu); + + TEST_EXECSPACE exec0( + sycl::queue{gpu_devices.front(), sycl::property::queue::in_order()}); + TEST_EXECSPACE exec1( + sycl::queue{gpu_devices.back(), sycl::property::queue::in_order()}); + + return {exec0, exec1}; +} + +TEST(sycl_multi_gpu, managed_views) { + std::array<TEST_EXECSPACE, 2> execs = get_execution_spaces(); + + Kokkos::View<int *, TEST_EXECSPACE> view0(Kokkos::view_alloc("v0", execs[0]), + 100); + Kokkos::View<int *, TEST_EXECSPACE> view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); +} + +TEST(sycl_multi_gpu, unmanaged_views) { + std::array<TEST_EXECSPACE, 2> execs = get_execution_spaces(); + + int *p0 = sycl::malloc_device<int>(100, execs[0].sycl_queue()); + Kokkos::View<int *, TEST_EXECSPACE> view0(p0, 100); + 
+ int *p1 = sycl::malloc_device<int>(100, execs[1].sycl_queue()); + Kokkos::View<int *, TEST_EXECSPACE> view1(p1, 100); + + test_policies(execs[0], view0, execs[1], view1); + sycl::free(p0, execs[0].sycl_queue()); + sycl::free(p1, execs[1].sycl_queue()); +} + +TEST(sycl_multi_gpu, scratch_space) { + std::array<TEST_EXECSPACE, 2> execs = get_execution_spaces(); + + test_scratch(execs[0], execs[1]); +} +} // namespace diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp index 914f8432488db3f82ffed012237b0473263de96b..e85aaa4f812e36964903811bd6013c13d13f060d 100644 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp @@ -21,257 +21,204 @@ namespace Test { TEST(sycl, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, - Kokkos::HostSpace>::assignable, - ""); - - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); - - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace>::assignable); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::SYCLHostUSMSpace>::assignable); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::SYCLDeviceUSMSpace>::assignable); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + 
Kokkos::SYCLDeviceUSMSpace>::accessible); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::SYCLSharedUSMSpace>::assignable); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::SYCLSharedUSMSpace>::accessible); //-------------------------------------- - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::SYCLDeviceUSMSpace>::assignable); - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::SYCLSharedUSMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::SYCLHostUSMSpace>::assignable); - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::SYCLHostUSMSpace>::accessible); static_assert( - !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::HostSpace>::assignable, - ""); + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::HostSpace>::assignable); static_assert( - !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::HostSpace>::accessible, - ""); + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLDeviceUSMSpace, + Kokkos::HostSpace>::accessible); //-------------------------------------- - static_assert(Kokkos::Impl::MemorySpaceAccess< - 
Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLSharedUSMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLDeviceUSMSpace>::assignable); - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLDeviceUSMSpace>::accessible); static_assert( - !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::HostSpace>::assignable, - ""); + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::HostSpace>::assignable); static_assert( - !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::HostSpace>::accessible, - ""); + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::HostSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLHostUSMSpace>::assignable); - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLSharedUSMSpace, + Kokkos::SYCLHostUSMSpace>::accessible); //-------------------------------------- - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - 
""); - static_assert( - !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::HostSpace>::assignable, - ""); + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, + Kokkos::SYCLHostUSMSpace>::assignable); static_assert( - Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::HostSpace>::accessible, - ""); + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, + Kokkos::HostSpace>::assignable); + + static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, + Kokkos::HostSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, + Kokkos::SYCLDeviceUSMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, + Kokkos::SYCLDeviceUSMSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, + Kokkos::SYCLSharedUSMSpace>::assignable); - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, + Kokkos::SYCLSharedUSMSpace>::accessible); //-------------------------------------- - static_assert(!Kokkos::SpaceAccessibility<Kokkos::Experimental::SYCL, - Kokkos::HostSpace>::accessible, - ""); - - static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + static_assert( + 
!Kokkos::SpaceAccessibility<Kokkos::SYCL, Kokkos::HostSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility<Kokkos::SYCL, + Kokkos::SYCLDeviceUSMSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility<Kokkos::SYCL, + Kokkos::SYCLSharedUSMSpace>::accessible); - static_assert(!Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility<Kokkos::SYCL, + Kokkos::SYCLHostUSMSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + static_assert( + !Kokkos::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::SYCLDeviceUSMSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::SYCLSharedUSMSpace>::accessible); static_assert( - std::is_same<Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::SYCLHostUSMSpace>::accessible); static_assert( - std::is_same< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLSharedUSMSpace>::Space, - Kokkos::Device<Kokkos::HostSpace::execution_space, - Kokkos::Experimental::SYCLSharedUSMSpace>>::value, - ""); + std::is_same<Kokkos::Impl::HostMirror<Kokkos::SYCLDeviceUSMSpace>::Space, + Kokkos::HostSpace>::value); static_assert( - Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::HostSpace>::accessible, - ""); + 
std::is_same<Kokkos::Impl::HostMirror<Kokkos::SYCLSharedUSMSpace>::Space, + Kokkos::Device<Kokkos::HostSpace::execution_space, + Kokkos::SYCLSharedUSMSpace>>::value); - static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::SYCLHostUSMSpace, + Kokkos::HostSpace>::accessible); - static_assert(std::is_same<Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLHostUSMSpace>::Space, - Kokkos::Experimental::SYCLHostUSMSpace>::value, - ""); + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::SYCLHostUSMSpace>::accessible); static_assert( - std::is_same< - Kokkos::Device<Kokkos::HostSpace::execution_space, - Kokkos::Experimental::SYCLSharedUSMSpace>, - Kokkos::Device<Kokkos::HostSpace::execution_space, - Kokkos::Experimental::SYCLSharedUSMSpace>>::value, - ""); + std::is_same<Kokkos::Impl::HostMirror<Kokkos::SYCLHostUSMSpace>::Space, + Kokkos::SYCLHostUSMSpace>::value); - static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror<Kokkos::Experimental::SYCL>::Space, - Kokkos::HostSpace>::accessible, - ""); + static_assert( + std::is_same<Kokkos::Device<Kokkos::HostSpace::execution_space, + Kokkos::SYCLSharedUSMSpace>, + Kokkos::Device<Kokkos::HostSpace::execution_space, + Kokkos::SYCLSharedUSMSpace>>::value); + + static_assert( + Kokkos::SpaceAccessibility<Kokkos::Impl::HostMirror<Kokkos::SYCL>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror<Kokkos::SYCLDeviceUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLSharedUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + 
Kokkos::Impl::HostMirror<Kokkos::SYCLSharedUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLHostUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror<Kokkos::SYCLHostUSMSpace>::Space, + Kokkos::HostSpace>::accessible); } TEST(sycl, uvm) { - int *uvm_ptr = static_cast<int *>( - Kokkos::kokkos_malloc<Kokkos::Experimental::SYCLSharedUSMSpace>( + int *uvm_ptr = + static_cast<int *>(Kokkos::kokkos_malloc<Kokkos::SYCLSharedUSMSpace>( "uvm_ptr", sizeof(int))); *uvm_ptr = 42; - Kokkos::Experimental::SYCL().fence(); + Kokkos::SYCL().fence(); Kokkos::parallel_for( - Kokkos::RangePolicy<Kokkos::Experimental::SYCL>(0, 1), - KOKKOS_LAMBDA(int) { + Kokkos::RangePolicy<Kokkos::SYCL>(0, 1), KOKKOS_LAMBDA(int) { if (*uvm_ptr == 42) { *uvm_ptr = 2 * 42; } }); - Kokkos::Experimental::SYCL().fence(); + Kokkos::SYCL().fence(); EXPECT_EQ(*uvm_ptr, int(2 * 42)); - Kokkos::kokkos_free<Kokkos::Experimental::SYCLSharedUSMSpace>(uvm_ptr); + Kokkos::kokkos_free<Kokkos::SYCLSharedUSMSpace>(uvm_ptr); } template <class MemSpace, class ExecSpace> @@ -311,17 +258,14 @@ struct TestViewSYCLAccessible { }; TEST(sycl, impl_view_accessible) { - TestViewSYCLAccessible<Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCL>::run(); + TestViewSYCLAccessible<Kokkos::SYCLDeviceUSMSpace, Kokkos::SYCL>::run(); - TestViewSYCLAccessible<Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCL>::run(); - TestViewSYCLAccessible<Kokkos::Experimental::SYCLSharedUSMSpace, + TestViewSYCLAccessible<Kokkos::SYCLSharedUSMSpace, Kokkos::SYCL>::run(); + TestViewSYCLAccessible<Kokkos::SYCLSharedUSMSpace, Kokkos::HostSpace::execution_space>::run(); - TestViewSYCLAccessible<Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCL>::run(); - TestViewSYCLAccessible<Kokkos::Experimental::SYCLHostUSMSpace, + TestViewSYCLAccessible<Kokkos::SYCLHostUSMSpace, 
Kokkos::SYCL>::run(); + TestViewSYCLAccessible<Kokkos::SYCLHostUSMSpace, Kokkos::HostSpace::execution_space>::run(); } diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp deleted file mode 100644 index 3c599b95a6f3362c1d8c33de69315ef8cfd3df44..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include <TestSYCL_Category.hpp> -#include <TestTaskScheduler.hpp> diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp index 9ab89df977a05e073dabb887c41dc72a70b2d503..ec4c05fe453b347d11d23310c5be5dab116f54e4 100644 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp @@ -22,17 +22,16 @@ namespace Test { namespace Impl { struct SYCLQueueScratchTestFunctor { - using team_t = Kokkos::TeamPolicy<Kokkos::Experimental::SYCL>::member_type; - using scratch_t = - Kokkos::View<int64_t*, Kokkos::Experimental::SYCL::scratch_memory_space>; + using team_t = Kokkos::TeamPolicy<Kokkos::SYCL>::member_type; + using scratch_t = Kokkos::View<int64_t*, Kokkos::SYCL::scratch_memory_space>; - Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::View<int64_t, Kokkos::SYCLDeviceUSMSpace, 
Kokkos::MemoryTraits<Kokkos::Atomic>> counter; int N, M; SYCLQueueScratchTestFunctor( - Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter_, - int N_, int M_) + Kokkos::View<int64_t, Kokkos::SYCLDeviceUSMSpace> counter_, int N_, + int M_) : counter(counter_), N(N_), M(M_) {} KOKKOS_FUNCTION @@ -54,12 +53,11 @@ struct SYCLQueueScratchTestFunctor { void sycl_queue_scratch_test_one( int N, int T, int M_base, - Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter, - Kokkos::Experimental::SYCL sycl, int tid) { + Kokkos::View<int64_t, Kokkos::SYCLDeviceUSMSpace> counter, + Kokkos::SYCL sycl, int tid) { int M = M_base + tid * 5; - Kokkos::TeamPolicy<Kokkos::Experimental::SYCL> p(sycl, T, 64); - using scratch_t = - Kokkos::View<int64_t*, Kokkos::Experimental::SYCL::scratch_memory_space>; + Kokkos::TeamPolicy<Kokkos::SYCL> p(sycl, T, 64); + using scratch_t = Kokkos::View<int64_t*, Kokkos::SYCL::scratch_memory_space>; int bytes = scratch_t::shmem_size(M); @@ -71,19 +69,19 @@ void sycl_queue_scratch_test_one( void sycl_queue_scratch_test( int N, int T, int M_base, - Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter) { + Kokkos::View<int64_t, Kokkos::SYCLDeviceUSMSpace> counter) { constexpr int K = 4; - Kokkos::Experimental::SYCL default_space; + Kokkos::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); sycl::queue queue(default_context, sycl::default_selector_v, sycl::property::queue::in_order()); - std::array<Kokkos::Experimental::SYCL, K> sycl; + std::array<Kokkos::SYCL, K> sycl; for (int i = 0; i < K; i++) { - sycl[i] = Kokkos::Experimental::SYCL( - sycl::queue(default_context, sycl::default_selector_v, - sycl::property::queue::in_order())); + sycl[i] = + Kokkos::SYCL(sycl::queue(default_context, sycl::default_selector_v, + sycl::property::queue::in_order())); } // Test that growing scratch size in subsequent calls doesn't crash things @@ -116,7 +114,7 @@ TEST(sycl, 
team_scratch_1_queues) { int T = 10; int M_base = 150; - Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter("C"); + Kokkos::View<int64_t, Kokkos::SYCLDeviceUSMSpace> counter("C"); Impl::sycl_queue_scratch_test(N, T, M_base, counter); diff --git a/packages/kokkos/core/unit_test/testmake.sh b/packages/kokkos/core/unit_test/testmake.sh deleted file mode 100755 index b5d4e8874d6bbd632bb7875bb931935018671195..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/testmake.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -if test "`basename $PWD`" = "cmaketest"; then - outfile=$1 -else - outfile=config/tmpstore/$1 -fi - -grep_arch=`grep KOKKOS_ARCH $outfile | grep $2 2>&1` -grep_devs=`grep KOKKOS_DEVICES $outfile | grep $3 2>&1` -if test -n "$grep_arch"; then - if test -n "$grep_devs"; then - echo Passed - else - echo Failed - fi -else - echo Failed -fi diff --git a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp index 3c85f661aaeafcff4016ba831cede90b19cb4b44..0527985ae9f32817528ffe51d6e0b574e9727416 100644 --- a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp +++ b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp @@ -28,8 +28,8 @@ class OpenMP; class Cuda; class Threads; class HIP; -namespace Experimental { class SYCL; +namespace Experimental { class OpenMPTarget; class HPX; } // namespace Experimental @@ -107,7 +107,7 @@ struct TestScanFunctor { template <typename Lambda> void test_wrapper(const Lambda& lambda) { - if (!std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Serial>::value) { + if (!std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Serial>) { lambda(); } } @@ -282,8 +282,8 @@ TEST(kokkosp, test_streams) { TEST(kokkosp, async_deep_copy) { // FIXME_OPENMPTARGET #ifdef KOKKOS_ENABLE_OPENMPTARGET - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::OpenMPTarget>::value) + if 
(std::is_same_v<Kokkos::DefaultExecutionSpace, + Kokkos::Experimental::OpenMPTarget>) GTEST_SKIP() << "skipping since the OpenMPTarget backend has unexpected fences"; #endif @@ -363,8 +363,8 @@ TEST(kokkosp, parallel_reduce) { TEST(kokkosp, parallel_scan) { // FIXME_OPENMPTARGET #ifdef KOKKOS_ENABLE_OPENMPTARGET - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::OpenMPTarget>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, + Kokkos::Experimental::OpenMPTarget>) GTEST_SKIP() << "skipping since the OpenMPTarget backend reports unexpected events"; #endif @@ -391,32 +391,36 @@ TEST(kokkosp, parallel_scan) { TEST(kokkosp, parallel_scan_no_fence) { // FIXME_THREADS #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Threads>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Threads>) GTEST_SKIP() << "skipping since the Thread backend always fences"; #endif #if defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::HPX>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Experimental::HPX>) GTEST_SKIP() << "skipping since the HPX backend always fences with async " "dispatch disabled"; #endif // FIXME_OPENMPTARGET #ifdef KOKKOS_ENABLE_OPENMPTARGET - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::OpenMPTarget>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, + Kokkos::Experimental::OpenMPTarget>) GTEST_SKIP() << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. 
+ TestScanFunctor tf; + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); auto success = validate_absence( - [=]() { - TestScanFunctor tf; - Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); - }, + [=]() { Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); }, [=](BeginFenceEvent begin_event) { if (begin_event.name.find("Debug Only Check for Execution Error") != std::string::npos || @@ -432,31 +436,37 @@ TEST(kokkosp, parallel_scan_no_fence) { TEST(kokkosp, parallel_scan_no_fence_view) { // FIXME_THREADS #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Threads>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Threads>) GTEST_SKIP() << "skipping since the Thread backend always fences"; #endif #if defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::HPX>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::Experimental::HPX>) GTEST_SKIP() << "skipping since the HPX backend always fences with async " "dispatch disabled"; #endif // FIXME_OPENMPTARGET #ifdef KOKKOS_ENABLE_OPENMPTARGET - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::OpenMPTarget>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, + Kokkos::Experimental::OpenMPTarget>) GTEST_SKIP() << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. 
+ TestScanFunctor tf; + Kokkos::View<typename TestScanFunctor::value_type> v("scan_result"); + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); - Kokkos::View<typename TestScanFunctor::value_type> v("scan_result"); auto success = validate_absence( [=]() { - TestScanFunctor tf; Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); }, [=](BeginFenceEvent begin_event) { @@ -510,8 +520,8 @@ TEST(kokkosp, fences) { TEST(kokkosp, raw_allocation) { // FIXME_OPENMPTARGET #ifdef KOKKOS_ENABLE_OPENMPTARGET - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::OpenMPTarget>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, + Kokkos::Experimental::OpenMPTarget>) GTEST_SKIP() << "skipping since the OpenMPTarget backend reports unexpected events"; #endif @@ -549,8 +559,8 @@ TEST(kokkosp, raw_allocation) { TEST(kokkosp, view) { // FIXME_OPENMPTARGET #ifdef KOKKOS_ENABLE_OPENMPTARGET - if (std::is_same<Kokkos::DefaultExecutionSpace, - Kokkos::Experimental::OpenMPTarget>::value) + if (std::is_same_v<Kokkos::DefaultExecutionSpace, + Kokkos::Experimental::OpenMPTarget>) GTEST_SKIP() << "skipping since the OpenMPTarget backend reports unexpected events"; #endif @@ -729,8 +739,7 @@ TEST(kokkosp, get_events) { Kokkos::Tools::popRegion(); }); for (const auto& ptr : event_vector) { - auto ptr_as_begin = std::dynamic_pointer_cast<BeginParallelForEvent>(ptr); - ASSERT_EQ(ptr_as_begin, nullptr); + ASSERT_FALSE(is_a<BeginParallelForEvent>(ptr)); } } } // namespace Test diff --git a/packages/kokkos/core/unit_test/tools/TestKernelNames.cpp b/packages/kokkos/core/unit_test/tools/TestKernelNames.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8b3d455487d92844b0642e4532d38d777636f64e --- /dev/null +++ b/packages/kokkos/core/unit_test/tools/TestKernelNames.cpp @@ -0,0 +1,219 @@ 
+//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +#include <gtest/gtest.h> + +namespace { + +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO +template <class T> +std::string typeid_name(T const&) { + return std::string(Kokkos::Impl::TypeInfo<T>::name()); +} +#else +template <class T> +std::string typeid_name(T const&) { + return typeid(T).name(); +} +#endif + +std::string last_parallel_for; +std::string last_parallel_reduce; +std::string last_parallel_scan; + +void get_parallel_for_kernel_name(char const* kernelName, uint32_t /*deviceID*/, + uint64_t* /*kernelID*/) { + last_parallel_for = kernelName; +} + +void get_parallel_reduce_kernel_name(char const* kernelName, + uint32_t /*deviceID*/, + uint64_t* /*kernelID*/) { + last_parallel_reduce = kernelName; +} + +void get_parallel_scan_kernel_name(char const* kernelName, + uint32_t /*deviceID*/, + uint64_t* /*kernelID*/) { + last_parallel_scan = kernelName; +} + +struct WorkTag {}; + +void test_kernel_name_parallel_for() { + Kokkos::Tools::Experimental::set_begin_parallel_for_callback( + get_parallel_for_kernel_name); + + using ExecutionSpace = Kokkos::DefaultExecutionSpace; + + { + std::string const my_label = "my_parallel_for_range_policy"; + + auto const my_lambda = KOKKOS_LAMBDA(int){}; + Kokkos::parallel_for(my_label, Kokkos::RangePolicy<ExecutionSpace>(0, 1), + my_lambda); + ASSERT_EQ(last_parallel_for, my_label); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, 1), my_lambda); + 
ASSERT_EQ(last_parallel_for, typeid_name(my_lambda)); +#ifndef KOKKOS_ENABLE_CXX17 + ASSERT_FALSE(last_parallel_for.starts_with("const ")) + << last_parallel_for << " is const-qualified"; +#endif + + auto const my_lambda_with_tag = KOKKOS_LAMBDA(WorkTag, int){}; + Kokkos::parallel_for(my_label, + Kokkos::RangePolicy<ExecutionSpace, WorkTag>(0, 1), + my_lambda_with_tag); + ASSERT_EQ(last_parallel_for, my_label); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace, WorkTag>(0, 1), + my_lambda_with_tag); + ASSERT_EQ(last_parallel_for, + typeid_name(my_lambda_with_tag) + "/" + typeid_name(WorkTag{})); +#ifndef KOKKOS_ENABLE_CXX17 + ASSERT_FALSE(last_parallel_for.starts_with("const ")) + << last_parallel_for << " is const-qualified"; +#endif + } + + Kokkos::Tools::Experimental::set_begin_parallel_for_callback(nullptr); +} + +void test_kernel_name_parallel_reduce() { + Kokkos::Tools::Experimental::set_begin_parallel_reduce_callback( + get_parallel_reduce_kernel_name); + + using ExecutionSpace = Kokkos::DefaultExecutionSpace; + + { + std::string const my_label = "my_parallel_reduce_range_policy"; + float my_result; + + auto const my_lambda = KOKKOS_LAMBDA(int, float&){}; + Kokkos::parallel_reduce(my_label, Kokkos::RangePolicy<ExecutionSpace>(0, 1), + my_lambda, my_result); + ASSERT_EQ(last_parallel_reduce, my_label); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, 1), + my_lambda, my_result); +#ifndef KOKKOS_COMPILER_MSVC + ASSERT_NE(last_parallel_reduce.find(typeid_name(my_lambda)), + std::string::npos) + << last_parallel_reduce << " does not contain " + << typeid_name( + my_lambda); // internally using Impl::CombinedFunctorReducer + // but the name should still include the lambda as + // template parameter +#endif +#ifndef KOKKOS_ENABLE_CXX17 + ASSERT_FALSE(last_parallel_reduce.starts_with("const ")) + << last_parallel_reduce << " is const-qualified"; +#endif + + auto const my_lambda_with_tag = KOKKOS_LAMBDA(WorkTag, int, float&){}; + 
Kokkos::parallel_reduce(my_label, + Kokkos::RangePolicy<ExecutionSpace, WorkTag>(0, 1), + my_lambda_with_tag, my_result); + ASSERT_EQ(last_parallel_reduce, my_label); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace, WorkTag>(0, 1), + my_lambda_with_tag, my_result); + auto const suffix = std::string("/") + typeid_name(WorkTag{}); + ASSERT_EQ(last_parallel_reduce.find(suffix), + last_parallel_reduce.length() - suffix.length()); +#ifndef KOKKOS_ENABLE_CXX17 + ASSERT_FALSE(last_parallel_reduce.starts_with("const ")) + << last_parallel_reduce << " is const-qualified"; +#endif + } + + Kokkos::Tools::Experimental::set_begin_parallel_reduce_callback(nullptr); +} + +void test_kernel_name_parallel_scan() { + Kokkos::Tools::Experimental::set_begin_parallel_scan_callback( + get_parallel_scan_kernel_name); + + using ExecutionSpace = Kokkos::DefaultExecutionSpace; + + { + std::string const my_label = "my_parallel_scan_range_policy"; + + auto const my_lambda = KOKKOS_LAMBDA(int, float&, bool){}; + Kokkos::parallel_scan(my_label, Kokkos::RangePolicy<ExecutionSpace>(0, 1), + my_lambda); + ASSERT_EQ(last_parallel_scan, my_label); + + Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace>(0, 1), my_lambda); + ASSERT_EQ(last_parallel_scan, typeid_name(my_lambda)); +#ifndef KOKKOS_ENABLE_CXX17 + ASSERT_FALSE(last_parallel_scan.starts_with("const ")) + << last_parallel_scan << " is const-qualified"; +#endif + + auto const my_lambda_with_tag = KOKKOS_LAMBDA(WorkTag, int, float&, bool){}; + Kokkos::parallel_scan(my_label, + Kokkos::RangePolicy<ExecutionSpace, WorkTag>(0, 1), + my_lambda_with_tag); + ASSERT_EQ(last_parallel_scan, my_label); + + Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace, WorkTag>(0, 1), + my_lambda_with_tag); + ASSERT_EQ(last_parallel_scan, + typeid_name(my_lambda_with_tag) + "/" + typeid_name(WorkTag{})); +#ifndef KOKKOS_ENABLE_CXX17 + ASSERT_FALSE(last_parallel_scan.starts_with("const ")) + << last_parallel_scan << " is const-qualified"; 
+#endif + } + + Kokkos::Tools::Experimental::set_begin_parallel_scan_callback(nullptr); +} + +TEST(kokkosp, kernel_name_parallel_for) { test_kernel_name_parallel_for(); } + +TEST(kokkosp, kernel_name_parallel_reduce) { + test_kernel_name_parallel_reduce(); +} + +TEST(kokkosp, kernel_name_parallel_scan) { test_kernel_name_parallel_scan(); } + +TEST(kokkosp, kernel_name_internal) { + struct ThisType {}; + { + std::string const label("my_label"); + Kokkos::Impl::ParallelConstructName<ThisType, void> pcn(label); + ASSERT_EQ(pcn.get(), label); + std::string const empty_label(""); + Kokkos::Impl::ParallelConstructName<ThisType, void> empty_pcn(empty_label); + ASSERT_EQ(empty_pcn.get(), typeid_name(ThisType{})); + } + { + std::string const label("my_label"); + Kokkos::Impl::ParallelConstructName<ThisType, WorkTag> pcn(label); + ASSERT_EQ(pcn.get(), label); + std::string const empty_label(""); + Kokkos::Impl::ParallelConstructName<ThisType, WorkTag> empty_pcn( + empty_label); + ASSERT_EQ(empty_pcn.get(), + typeid_name(ThisType{}) + "/" + typeid_name(WorkTag{})); + } +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp deleted file mode 100644 index 4e56f8996a03e2da0821b5b083b20285bbe8d7f8..0000000000000000000000000000000000000000 --- a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp +++ /dev/null @@ -1,177 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -#include <iostream> -#include <gtest/gtest.h> -#include "Kokkos_Core.hpp" - -#include <impl/Kokkos_Stacktrace.hpp> - -namespace Test { - -void debug_print(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Alloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} -void debug_dealloc(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Dealloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} - -void fail_on_event(const Kokkos::Profiling::SpaceHandle, const char*, - const void*, const uint64_t) { - ASSERT_TRUE(false) << "Unexpected memory event"; -} - -void expect_no_events() { - Kokkos::Tools::Experimental::set_allocate_data_callback(&fail_on_event); - Kokkos::Tools::Experimental::set_deallocate_data_callback(&fail_on_event); -} - -std::string expected_view_name; -std::string expected_space_name; -std::string error_message; -void expect_allocation_event(const std::string evn, const std::string esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_allocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} -void expect_deallocation_event(const std::string& evn, const std::string& esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_deallocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { 
- ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} - -struct TestSpaceNamer { - static constexpr const char* get_name() { return "TestSpace"; } -}; -struct TestSpaceNamerTwo { - static constexpr const char* get_name() { return "YoDawg"; } -}; -struct TestSpaceNamerThree { - static constexpr const char* get_name() { return "CustomAccessSpace"; } -}; -using fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, TestSpaceNamer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - -void test_view_construct() { - { - expect_allocation_event("puppy_view", "TestSpace", "View allocation"); - Kokkos::View<double*, fake_memory_space> pup_view("puppy_view", 1000); - expect_deallocation_event("puppy_view", "TestSpace", "View free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_malloc_free() { - expect_allocation_event("does_malloc_work", "TestSpace", - "Error in malloc event"); - auto* temp = - Kokkos::kokkos_malloc<fake_memory_space>("does_malloc_work", 1000); - expect_deallocation_event("does_malloc_work", "TestSpace", "Error in free"); - Kokkos::kokkos_free<fake_memory_space>(temp); - Kokkos::Tools::Experimental::pause_tools(); -} -void test_chained_spaces() { - using doubly_fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - fake_memory_space, Kokkos::DefaultHostExecutionSpace, TestSpaceNamerTwo, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - { - expect_allocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space view allocation"); - Kokkos::View<double*, doubly_fake_memory_space> pup_view("xzibit_dot_jpeg", - 1000); - expect_deallocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void 
test_space_allocations() { - fake_memory_space debug_space; - expect_allocation_event("allocation_from_space", "TestSpace", - "Space allocation"); - auto* temp = debug_space.allocate("allocation_from_space", 1000); - expect_deallocation_event("allocation_from_space", "TestSpace", - "Space deallocation"); - debug_space.deallocate("allocation_from_space", temp, 1000); - Kokkos::Tools::Experimental::pause_tools(); -} -template <typename Space> -struct AccessCheckKernel { - Kokkos::View<double*, Space> data; - KOKKOS_FUNCTION void operator()(const int i) const { data[i] = i; } -}; - -template <typename Space> -void test_allowed_access() { - constexpr const int data_size = 1000; - // We use an unmananged View here since we want to detect a memory access - // violation in the parallel_for and not in the initialization of the View. - std::vector<double> test_data(data_size); - Kokkos::View<double*, Space> test_view(test_data.data(), data_size); - AccessCheckKernel<Space> functor{test_view}; - Kokkos::parallel_for( - "access_allowed", - Kokkos::RangePolicy<typename Space::execution_space>(0, data_size), - functor); - Kokkos::fence(); -} - -using semantically_independent_logical_space = - Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, - TestSpaceNamerThree, - Kokkos::Experimental::LogicalSpaceSharesAccess::no_shared_access>; - -TEST(defaultdevicetype, logical_space_views) { test_view_construct(); } -TEST(defaultdevicetype, logical_space_malloc) { test_malloc_free(); } -TEST(defaultdevicetype, logical_space_alloc) { test_space_allocations(); } -TEST(defaultdevicetype, chained_logical_spaces) { test_chained_spaces(); } -TEST(defaultdevicetype, access_allowed) { - test_allowed_access<fake_memory_space>(); -} -// FIXME_SYCL -#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) -TEST(defaultdevicetype_DeathTest, access_forbidden) { - ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { 
test_allowed_access<semantically_independent_logical_space>(); }, - "Kokkos::View ERROR: attempt to access inaccessible memory space"); -} -#endif - -} // namespace Test diff --git a/packages/kokkos/core/unit_test/tools/TestProfilingSection.cpp b/packages/kokkos/core/unit_test/tools/TestProfilingSection.cpp index 318766ac455fe9db899e7b9f984eac984a4ea0e5..64943958da1540def3fa06a50135abc95ba64938 100644 --- a/packages/kokkos/core/unit_test/tools/TestProfilingSection.cpp +++ b/packages/kokkos/core/unit_test/tools/TestProfilingSection.cpp @@ -57,7 +57,7 @@ void kokkosp_test_destroy_section(std::uint32_t id) { } // namespace -TEST(defaultdevicetype, profiling_section) { +TEST(kokkosp, profiling_section) { Kokkos::Profiling::Experimental::set_create_profile_section_callback( kokkosp_test_create_section); Kokkos::Profiling::Experimental::set_destroy_profile_section_callback( @@ -108,8 +108,8 @@ TEST(defaultdevicetype, profiling_section) { } using Kokkos::Profiling::ProfilingSection; -static_assert(!std::is_default_constructible<ProfilingSection>::value, ""); -static_assert(!std::is_copy_constructible<ProfilingSection>::value, ""); -static_assert(!std::is_move_constructible<ProfilingSection>::value, ""); -static_assert(!std::is_copy_assignable<ProfilingSection>::value, ""); -static_assert(!std::is_move_assignable<ProfilingSection>::value, ""); +static_assert(!std::is_default_constructible_v<ProfilingSection>); +static_assert(!std::is_copy_constructible_v<ProfilingSection>); +static_assert(!std::is_move_constructible_v<ProfilingSection>); +static_assert(!std::is_copy_assignable_v<ProfilingSection>); +static_assert(!std::is_move_assignable_v<ProfilingSection>); diff --git a/packages/kokkos/core/unit_test/tools/TestScopedRegion.cpp b/packages/kokkos/core/unit_test/tools/TestScopedRegion.cpp index 5306496d764d7b3b7038a329d81c46bc3eea86f7..3d39e106dfea84a074e5ad9f6c5eb348ddb5d29d 100644 --- a/packages/kokkos/core/unit_test/tools/TestScopedRegion.cpp +++ 
b/packages/kokkos/core/unit_test/tools/TestScopedRegion.cpp @@ -31,7 +31,7 @@ void test_push_region(char const *label) { test_region_stack.push(label); } void test_pop_region() { test_region_stack.pop(); } -TEST(defaultdevicetype, scoped_profile_region) { +TEST(kokkosp, scoped_profile_region) { Kokkos::Tools::Experimental::set_push_region_callback(test_push_region); Kokkos::Tools::Experimental::set_pop_region_callback(test_pop_region); @@ -63,10 +63,10 @@ TEST(defaultdevicetype, scoped_profile_region) { } using Kokkos::Profiling::ScopedRegion; -static_assert(!std::is_default_constructible<ScopedRegion>::value); -static_assert(!std::is_copy_constructible<ScopedRegion>::value); -static_assert(!std::is_move_constructible<ScopedRegion>::value); -static_assert(!std::is_copy_assignable<ScopedRegion>::value); -static_assert(!std::is_move_assignable<ScopedRegion>::value); +static_assert(!std::is_default_constructible_v<ScopedRegion>); +static_assert(!std::is_copy_constructible_v<ScopedRegion>); +static_assert(!std::is_move_constructible_v<ScopedRegion>); +static_assert(!std::is_copy_assignable_v<ScopedRegion>); +static_assert(!std::is_move_assignable_v<ScopedRegion>); } // namespace diff --git a/packages/kokkos/core/unit_test/tools/TestTuning.cpp b/packages/kokkos/core/unit_test/tools/TestTuning.cpp index 37dc931aa10b22995b8d14f97fc62343690066b9..c56d2f060d3ad159841379bad14673ca99da658f 100644 --- a/packages/kokkos/core/unit_test/tools/TestTuning.cpp +++ b/packages/kokkos/core/unit_test/tools/TestTuning.cpp @@ -61,7 +61,7 @@ int main() { Kokkos::Tools::Experimental::VariableInfo* info) { if (info->type != Kokkos::Tools::Experimental::ValueType::kokkos_value_int64) { - throw(std::runtime_error("Tuning Variable has wrong type")); + Kokkos::abort("Tuning Variable has wrong type"); } }); Kokkos::Tools::Experimental::set_declare_input_type_callback( @@ -69,7 +69,7 @@ int main() { Kokkos::Tools::Experimental::VariableInfo* info) { if (info->type != 
Kokkos::Tools::Experimental::ValueType::kokkos_value_int64) { - throw(std::runtime_error("Context Variable has wrong type")); + Kokkos::abort("Context Variable has wrong type"); } }); tuningVariableInfo.candidates = allowed_values; @@ -93,7 +93,7 @@ int main() { auto candidate_values = tuning_values[0].metadata->candidates; if (context_values[0].value.int_value != expectedContextVariableValue) { - throw std::runtime_error( + Kokkos::abort( "Context variables not correctly passed to tuning callbacks"); } int tuningVariableSetSize = candidate_values.set.size; @@ -112,7 +112,7 @@ int main() { std::cout << tuningValues[0].value.int_value << "," << candidate_value_vector[4] << std::endl; if (tuningValues[0].value.int_value != candidate_value_vector[4]) { - throw std::runtime_error("Tuning value return is incorrect"); + Kokkos::abort("Tuning value return is incorrect"); } Kokkos::Tools::Experimental::end_context(context); @@ -145,9 +145,9 @@ int main() { std::cout << "Expect " << expectedNumberOfContextVariables << ", have " << num_context_variables << std::endl; if (num_context_variables != expectedNumberOfContextVariables) { - throw( - std::runtime_error("Incorrect number of context variables in " - "nested tuning contexts")); + Kokkos::abort( + "Incorrect number of context variables in nested tuning " + "contexts"); } }); Kokkos::Tools::Experimental::set_input_values(outerContext, 1, diff --git a/packages/kokkos/core/unit_test/tools/include/ToolTestingUtilities.hpp b/packages/kokkos/core/unit_test/tools/include/ToolTestingUtilities.hpp index 24b9bc3739e13d8a3824f013a188aa069a7a55b2..a162c939a574690d0c22f7be7f2d53bb72525de4 100644 --- a/packages/kokkos/core/unit_test/tools/include/ToolTestingUtilities.hpp +++ b/packages/kokkos/core/unit_test/tools/include/ToolTestingUtilities.hpp @@ -64,6 +64,27 @@ struct EventBase; // forward declaration using EventBasePtr = std::shared_ptr<EventBase>; using event_vector = std::vector<EventBasePtr>; +// unique identifier for derived 
event classes +template <class EventDerived, + std::enable_if_t<std::is_base_of_v<EventBase, EventDerived> && + std::is_final_v<EventDerived>>* = nullptr> +uintptr_t event_type_uid() { + static char x{}; + return reinterpret_cast<uintptr_t>(&x); +} + +// runtime check to determine if an event object of EventBase class is in fact +// of a particular EventDerived class in which case it is safe to downcast +// (essentially some hand-rolled form of dynamic pointer casting w/o using RTTI) +template < + class EventDerived, class SomeEventBasePtr, + std::enable_if_t<std::is_base_of_v<EventBase, EventDerived> && + std::is_final_v<EventDerived> && + std::is_same_v<SomeEventBasePtr, EventBasePtr>>* = nullptr> +bool is_a(SomeEventBasePtr const& e) { + return e->kind() == event_type_uid<EventDerived>(); +} + /** * @brief In order to call some arbitrary set of lambdas representing matchers, * we need the ability to look at a lambda, and deduce its arguments. @@ -103,10 +124,10 @@ struct function_traits<R (*)(A...)> { constexpr static int num_arguments = sizeof...(A); template <class Call, class... Args> static auto invoke_as(const Call& call, Args&&... args) { - if (!(std::dynamic_pointer_cast<A>(std::forward<Args>(args)) && ...)) { + if (!(is_a<A>(std::forward<Args>(args)) && ...)) { return MatchDiagnostic{false, {"Types didn't match on arguments"}}; } - return call(*std::dynamic_pointer_cast<A>(std::forward<Args>(args))...); + return call(*std::static_pointer_cast<A>(std::forward<Args>(args))...); } }; @@ -127,10 +148,10 @@ struct function_traits<R (C::*)(A...)> { constexpr static int num_arguments = sizeof...(A); template <class Call, class... Args> static auto invoke_as(const Call& call, Args&&... 
args) { - if (!(std::dynamic_pointer_cast<A>(std::forward<Args>(args)) && ...)) { + if (!(is_a<A>(std::forward<Args>(args)) && ...)) { return MatchDiagnostic{false, {"Types didn't match on arguments"}}; } - return call(*std::dynamic_pointer_cast<A>(std::forward<Args>(args))...); + return call(*std::static_pointer_cast<A>(std::forward<Args>(args))...); } }; @@ -152,10 +173,10 @@ struct function_traits<R (C::*)(A...) const> // const constexpr static int num_arguments = sizeof...(A); template <class Call, class... Args> static auto invoke_as(const Call& call, Args&&... args) { - if (!(std::dynamic_pointer_cast<A>(std::forward<Args>(args)) && ...)) { + if (!(is_a<A>(std::forward<Args>(args)) && ...)) { return MatchDiagnostic{false, {"Types didn't match on arguments"}}; } - return call(*std::dynamic_pointer_cast<A>(std::forward<Args>(args))...); + return call(*std::static_pointer_cast<A>(std::forward<Args>(args))...); } }; @@ -168,7 +189,7 @@ struct function_traits<R (C::*)(A...) const> // const * @tparam T The functor type */ template <typename T> -struct function_traits<T, std::void_t<decltype(&T::operator())> > +struct function_traits<T, std::void_t<decltype(&T::operator())>> : public function_traits<decltype(&T::operator())> {}; /** @@ -284,6 +305,12 @@ struct EventBase { using PtrHandle = const void* const; virtual ~EventBase() = default; virtual std::string descriptor() const = 0; + virtual uintptr_t kind() const = 0; +}; + +template <class Derived> +struct UniquelyIdentifiableEventType : public EventBase { + uintptr_t kind() const override { return event_type_uid<Derived>(); } }; /** @@ -293,7 +320,7 @@ struct EventBase { * @tparam Derived CRTP, intended for use with dynamic_casts */ template <class Derived> -struct BeginOperation : public EventBase { +struct BeginOperation : public UniquelyIdentifiableEventType<Derived> { const std::string name; const uint32_t deviceID; uint64_t kID; @@ -317,7 +344,7 @@ struct BeginOperation : public EventBase { * same type */ 
template <class Derived> -struct EndOperation : public EventBase { +struct EndOperation : public UniquelyIdentifiableEventType<Derived> { uint64_t kID; EndOperation(uint64_t k) : kID(k) {} @@ -336,7 +363,8 @@ struct EndOperation : public EventBase { * type. So the different type names here are meaningful, even though the * classes are empty */ -struct BeginParallelForEvent : public BeginOperation<BeginParallelForEvent> { +struct BeginParallelForEvent final + : public BeginOperation<BeginParallelForEvent> { static const std::string& begin_op_name() { static std::string value = "BeginParallelFor"; return value; @@ -344,7 +372,7 @@ struct BeginParallelForEvent : public BeginOperation<BeginParallelForEvent> { BeginParallelForEvent(std::string n, const uint32_t devID, uint64_t k) : BeginOperation<BeginParallelForEvent>(n, devID, k) {} }; -struct BeginParallelReduceEvent +struct BeginParallelReduceEvent final : public BeginOperation<BeginParallelReduceEvent> { static const std::string& begin_op_name() { static std::string value = "BeginParallelReduce"; @@ -354,7 +382,8 @@ struct BeginParallelReduceEvent BeginParallelReduceEvent(std::string n, const uint32_t devID, uint64_t k) : BeginOperation<BeginParallelReduceEvent>(n, devID, k) {} }; -struct BeginParallelScanEvent : public BeginOperation<BeginParallelScanEvent> { +struct BeginParallelScanEvent final + : public BeginOperation<BeginParallelScanEvent> { static const std::string& begin_op_name() { static std::string value = "BeginParallelScan"; return value; @@ -363,7 +392,7 @@ struct BeginParallelScanEvent : public BeginOperation<BeginParallelScanEvent> { BeginParallelScanEvent(std::string n, const uint32_t devID, uint64_t k) : BeginOperation<BeginParallelScanEvent>(n, devID, k) {} }; -struct BeginFenceEvent : public BeginOperation<BeginFenceEvent> { +struct BeginFenceEvent final : public BeginOperation<BeginFenceEvent> { static const std::string& begin_op_name() { static std::string value = "BeginFence"; return value; 
@@ -373,7 +402,7 @@ struct BeginFenceEvent : public BeginOperation<BeginFenceEvent> { : BeginOperation<BeginFenceEvent>(n, devID, k) {} }; -struct EndParallelForEvent : public EndOperation<EndParallelForEvent> { +struct EndParallelForEvent final : public EndOperation<EndParallelForEvent> { static const std::string& end_op_name() { static std::string value = "EndParallelFor"; return value; @@ -381,7 +410,8 @@ struct EndParallelForEvent : public EndOperation<EndParallelForEvent> { EndParallelForEvent(uint64_t k) : EndOperation<EndParallelForEvent>(k) {} }; -struct EndParallelReduceEvent : public EndOperation<EndParallelReduceEvent> { +struct EndParallelReduceEvent final + : public EndOperation<EndParallelReduceEvent> { static const std::string& end_op_name() { static std::string value = "EndParallelReduce"; return value; @@ -390,7 +420,7 @@ struct EndParallelReduceEvent : public EndOperation<EndParallelReduceEvent> { EndParallelReduceEvent(uint64_t k) : EndOperation<EndParallelReduceEvent>(k) {} }; -struct EndParallelScanEvent : public EndOperation<EndParallelScanEvent> { +struct EndParallelScanEvent final : public EndOperation<EndParallelScanEvent> { static const std::string& end_op_name() { static std::string value = "EndParallelScan"; return value; @@ -398,7 +428,7 @@ struct EndParallelScanEvent : public EndOperation<EndParallelScanEvent> { EndParallelScanEvent(uint64_t k) : EndOperation<EndParallelScanEvent>(k) {} }; -struct EndFenceEvent : public EndOperation<EndFenceEvent> { +struct EndFenceEvent final : public EndOperation<EndFenceEvent> { static const std::string& end_op_name() { static std::string value = "EndFence"; return value; @@ -407,7 +437,7 @@ struct EndFenceEvent : public EndOperation<EndFenceEvent> { EndFenceEvent(uint64_t k) : EndOperation<EndFenceEvent>(k) {} }; -struct InitEvent : public EventBase { +struct InitEvent final : public UniquelyIdentifiableEventType<InitEvent> { int load_sequence; uint64_t version_number; uint32_t num_device_infos; @@ 
-425,11 +455,13 @@ struct InitEvent : public EventBase { num_device_infos(n_d_i), device_infos(d_i) {} }; -struct FinalizeEvent : public EventBase { +struct FinalizeEvent final + : public UniquelyIdentifiableEventType<FinalizeEvent> { std::string descriptor() const override { return "FinalizeEvent{}"; } }; -struct ParseArgsEvent : public EventBase { +struct ParseArgsEvent final + : public UniquelyIdentifiableEventType<ParseArgsEvent> { int num_args; char** args; @@ -444,26 +476,29 @@ struct ParseArgsEvent : public EventBase { } ParseArgsEvent(int n_a, char** a) : num_args(n_a), args(a) {} }; -struct PrintHelpEvent : public EventBase { +struct PrintHelpEvent final + : public UniquelyIdentifiableEventType<PrintHelpEvent> { char* prog_name; std::string descriptor() const override { return "PrintHelpEvent { Program Name: \"" + std::string(prog_name) + "\"}"; } PrintHelpEvent(char* p_n) : prog_name(p_n) {} }; -struct PushRegionEvent : public EventBase { +struct PushRegionEvent final + : public UniquelyIdentifiableEventType<PushRegionEvent> { std::string name; std::string descriptor() const override { return "PushRegionEvent { Region Name: \"" + name + "\" }"; } PushRegionEvent(std::string n) : name(n) {} }; -struct PopRegionEvent : public EventBase { +struct PopRegionEvent final + : public UniquelyIdentifiableEventType<PopRegionEvent> { std::string descriptor() const override { return "PopRegionEvent{}"; } }; template <class Derived> -struct DataEvent : public EventBase { +struct DataEvent : public UniquelyIdentifiableEventType<Derived> { using SpaceHandleType = Kokkos::Profiling::SpaceHandle; SpaceHandleType handle; std::string name; @@ -482,20 +517,21 @@ struct DataEvent : public EventBase { : handle(h), name(n), ptr(p), size(s) {} }; -struct AllocateDataEvent : public DataEvent<AllocateDataEvent> { +struct AllocateDataEvent final : public DataEvent<AllocateDataEvent> { static std::string event_name() { return "AllocateDataEvent"; } 
AllocateDataEvent(DataEvent::SpaceHandleType h, std::string n, EventBase::PtrHandle p, uint64_t s) : DataEvent<AllocateDataEvent>(h, n, p, s) {} }; -struct DeallocateDataEvent : public DataEvent<DeallocateDataEvent> { +struct DeallocateDataEvent final : public DataEvent<DeallocateDataEvent> { static std::string event_name() { return "DeallocateDataEvent"; } DeallocateDataEvent(DataEvent::SpaceHandleType h, std::string n, EventBase::PtrHandle p, uint64_t s) : DataEvent<DeallocateDataEvent>(h, n, p, s) {} }; -struct CreateProfileSectionEvent : public EventBase { +struct CreateProfileSectionEvent final + : public UniquelyIdentifiableEventType<CreateProfileSectionEvent> { std::string name; uint32_t id; std::string descriptor() const override { @@ -506,7 +542,8 @@ struct CreateProfileSectionEvent : public EventBase { }; template <class Derived> -struct ProfileSectionManipulationEvent : public EventBase { +struct ProfileSectionManipulationEvent + : public UniquelyIdentifiableEventType<Derived> { uint32_t id; std::string descriptor() const override { std::stringstream s; @@ -516,26 +553,26 @@ struct ProfileSectionManipulationEvent : public EventBase { ProfileSectionManipulationEvent(uint32_t d_i) : id(d_i){}; }; -struct StartProfileSectionEvent +struct StartProfileSectionEvent final : public ProfileSectionManipulationEvent<StartProfileSectionEvent> { static std::string event_name() { return "StartProfileSectionEvent"; } StartProfileSectionEvent(uint32_t d_i) : ProfileSectionManipulationEvent<StartProfileSectionEvent>(d_i){}; }; -struct StopProfileSectionEvent +struct StopProfileSectionEvent final : public ProfileSectionManipulationEvent<StopProfileSectionEvent> { static std::string event_name() { return "StopProfileSectionEvent"; } StopProfileSectionEvent(uint32_t d_i) : ProfileSectionManipulationEvent<StopProfileSectionEvent>(d_i){}; }; -struct DestroyProfileSectionEvent +struct DestroyProfileSectionEvent final : public 
ProfileSectionManipulationEvent<DestroyProfileSectionEvent> { static std::string event_name() { return "DestroyProfileSectionEvent"; } DestroyProfileSectionEvent(uint32_t d_i) : ProfileSectionManipulationEvent<DestroyProfileSectionEvent>(d_i){}; }; -struct ProfileEvent : public EventBase { +struct ProfileEvent final : public UniquelyIdentifiableEventType<ProfileEvent> { std::string name; std::string descriptor() const override { return "ProfileEvent {\"" + name + "\"}"; @@ -543,7 +580,8 @@ struct ProfileEvent : public EventBase { ProfileEvent(std::string n) : name(n) {} }; -struct BeginDeepCopyEvent : public EventBase { +struct BeginDeepCopyEvent final + : public UniquelyIdentifiableEventType<BeginDeepCopyEvent> { using SpaceHandleType = Kokkos::Profiling::SpaceHandle; SpaceHandleType src_handle; std::string src_name; @@ -573,12 +611,13 @@ struct BeginDeepCopyEvent : public EventBase { dst_ptr(d_p), size(s) {} }; -struct EndDeepCopyEvent : public EventBase { +struct EndDeepCopyEvent final + : public UniquelyIdentifiableEventType<EndDeepCopyEvent> { std::string descriptor() const override { return "EndDeepCopyEvent{}"; } }; template <class Derived> -struct DualViewEvent : public EventBase { +struct DualViewEvent : public UniquelyIdentifiableEventType<Derived> { std::string name; EventBase::PtrHandle ptr; bool is_device; @@ -591,18 +630,19 @@ struct DualViewEvent : public EventBase { return s.str(); } }; -struct DualViewModifyEvent : public DualViewEvent<DualViewModifyEvent> { +struct DualViewModifyEvent final : public DualViewEvent<DualViewModifyEvent> { static std::string event_name() { return "DualViewModifyEvent"; } DualViewModifyEvent(std::string n, EventBase::PtrHandle p, bool i_d) : DualViewEvent(n, p, i_d) {} }; -struct DualViewSyncEvent : public DualViewEvent<DualViewSyncEvent> { +struct DualViewSyncEvent final : public DualViewEvent<DualViewSyncEvent> { static std::string event_name() { return "DualViewSyncEvent"; } DualViewSyncEvent(std::string n, 
EventBase::PtrHandle p, bool i_d) : DualViewEvent(n, p, i_d) {} }; -struct DeclareMetadataEvent : public EventBase { +struct DeclareMetadataEvent final + : public UniquelyIdentifiableEventType<DeclareMetadataEvent> { std::string key; std::string value; std::string descriptor() const override { @@ -611,7 +651,9 @@ struct DeclareMetadataEvent : public EventBase { DeclareMetadataEvent(std::string k, std::string v) : key(k), value(v) {} }; -struct ProvideToolProgrammingInterfaceEvent : public EventBase { +struct ProvideToolProgrammingInterfaceEvent final + : public UniquelyIdentifiableEventType< + ProvideToolProgrammingInterfaceEvent> { using Interface = Kokkos::Tools::Experimental::ToolProgrammingInterface; uint32_t num_functions; @@ -623,7 +665,8 @@ struct ProvideToolProgrammingInterfaceEvent : public EventBase { std::to_string(num_functions) + "}"; } }; -struct RequestToolSettingsEvent : public EventBase { +struct RequestToolSettingsEvent final + : public UniquelyIdentifiableEventType<RequestToolSettingsEvent> { using Settings = Kokkos::Tools::Experimental::ToolSettings; uint32_t num_settings; @@ -636,7 +679,7 @@ struct RequestToolSettingsEvent : public EventBase { }; template <class Derived> -struct TypeDeclarationEvent : public EventBase { +struct TypeDeclarationEvent : public UniquelyIdentifiableEventType<Derived> { std::string name; size_t variable_id; Kokkos::Tools::Experimental::VariableInfo info; @@ -648,14 +691,14 @@ struct TypeDeclarationEvent : public EventBase { Kokkos::Tools::Experimental::VariableInfo i) : name(n), variable_id(v_i), info(i) {} }; -struct DeclareOutputTypeEvent +struct DeclareOutputTypeEvent final : public TypeDeclarationEvent<DeclareOutputTypeEvent> { static std::string event_name() { return "DeclarateOutputTypeEvent"; } DeclareOutputTypeEvent(std::string n, size_t v_i, Kokkos::Tools::Experimental::VariableInfo i) : TypeDeclarationEvent(n, v_i, i) {} }; -struct DeclareInputTypeEvent +struct DeclareInputTypeEvent final : public 
TypeDeclarationEvent<DeclareInputTypeEvent> { static std::string event_name() { return "DeclareInputTypeEvent"; } DeclareInputTypeEvent(std::string n, size_t v_i, @@ -663,7 +706,8 @@ struct DeclareInputTypeEvent : TypeDeclarationEvent(n, v_i, i) {} }; -struct RequestOutputValuesEvent : public EventBase { +struct RequestOutputValuesEvent final + : public UniquelyIdentifiableEventType<RequestOutputValuesEvent> { size_t context; size_t num_inputs; std::vector<Kokkos::Tools::Experimental::VariableValue> inputs; @@ -683,14 +727,16 @@ struct RequestOutputValuesEvent : public EventBase { : context(c), num_inputs(n_i), inputs(i), num_outputs(n_o), outputs(o) {} }; -struct BeginContextEvent : public EventBase { +struct BeginContextEvent final + : public UniquelyIdentifiableEventType<BeginContextEvent> { size_t context; std::string descriptor() const override { return "ContextBeginEvent{ " + std::to_string(context) + "}"; } BeginContextEvent(size_t c) : context(c) {} }; -struct EndContextEvent : public EventBase { +struct EndContextEvent final + : public UniquelyIdentifiableEventType<EndContextEvent> { size_t context; Kokkos::Tools::Experimental::VariableValue value; std::string descriptor() const override { @@ -700,7 +746,8 @@ struct EndContextEvent : public EventBase { : context(c), value(v) {} }; -struct OptimizationGoalDeclarationEvent : public EventBase { +struct OptimizationGoalDeclarationEvent final + : public UniquelyIdentifiableEventType<OptimizationGoalDeclarationEvent> { size_t context; Kokkos::Tools::Experimental::OptimizationGoal goal; std::string descriptor() const override { @@ -925,7 +972,8 @@ static uint64_t last_kernel_id; static uint32_t last_section_id; /** Subscribes to all of the requested callbacks */ -static void set_tool_events_impl(const ToolValidatorConfiguration& config) { +static inline void set_tool_events_impl( + const ToolValidatorConfiguration& config) { Kokkos::Tools::Experimental::pause_tools(); // remove all events if 
(config.profiling.kernels) { Kokkos::Tools::Experimental::set_begin_parallel_for_callback( diff --git a/packages/kokkos/core/unit_test/view/TestBasicView.hpp b/packages/kokkos/core/unit_test/view/TestBasicView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b7ba7ec694b902c2faf4cd371da4bf68f3c94c32 --- /dev/null +++ b/packages/kokkos/core/unit_test/view/TestBasicView.hpp @@ -0,0 +1,264 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <type_traits> + +using ExecutionSpace = TEST_EXECSPACE; + +namespace { +template <class ExecutionSpace, class Extents> +auto make_spanning_mdrange_policy_from_extents_impl(const Extents &extents, + std::index_sequence<0>) { + return Kokkos::RangePolicy<ExecutionSpace>{0, extents.extent(0)}; +} + +template <class ExecutionSpace, class Extents, std::size_t... 
Indices> +auto make_spanning_mdrange_policy_from_extents_impl( + const Extents &extents, std::index_sequence<Indices...>) { + using index_type = typename Extents::index_type; + constexpr auto rank = Extents::rank(); + return Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<rank>>{ + {(static_cast<index_type>(Indices * 0))...}, + {extents.extent(Indices)...}}; +} + +template <class ExecutionSpace, class Extents> +auto make_spanning_mdrange_policy_from_extents(const Extents &extents) { + return make_spanning_mdrange_policy_from_extents_impl<ExecutionSpace>( + extents, std::make_index_sequence<Extents::rank()>{}); +} + +template <class T, class ExtentsType> +void test_default_constructor() { + using extents_type = ExtentsType; + using layout_type = Kokkos::Experimental::layout_right_padded<>; + using accessor_type = Kokkos::Impl::CheckedReferenceCountedAccessor< + T, typename ExecutionSpace::memory_space>; + using view_type = + Kokkos::Impl::BasicView<T, extents_type, layout_type, accessor_type>; + view_type view; + + EXPECT_FALSE(view.data_handle().has_record()); + EXPECT_EQ(view.data_handle().get(), nullptr); + EXPECT_EQ(view.extents(), extents_type{}); + EXPECT_EQ(view.data_handle().use_count(), 0); + EXPECT_TRUE(view.is_exhaustive()); + EXPECT_EQ(view.data_handle().get_label(), ""); + EXPECT_TRUE(view.empty()); + EXPECT_EQ(view.size(), 0u); +} + +TEST(TEST_CATEGORY, basic_view_default_ctor) { + test_default_constructor<double, Kokkos::extents<std::size_t, 1>>(); +} + +template <class T, class ExtentsType> +void test_extents_constructor(const ExtentsType &extents) { + using extents_type = ExtentsType; + using layout_type = Kokkos::Experimental::layout_right_padded<>; + using accessor_type = Kokkos::Impl::CheckedReferenceCountedAccessor< + T, typename ExecutionSpace::memory_space>; + using view_type = + Kokkos::Impl::BasicView<T, extents_type, layout_type, accessor_type>; + + view_type view("test_view", extents); + + EXPECT_TRUE(view.data_handle().has_record()); + 
EXPECT_NE(view.data_handle().get(), nullptr); + EXPECT_EQ(view.extents(), extents); + EXPECT_EQ(view.data_handle().use_count(), 1); + EXPECT_TRUE(view.is_exhaustive()); + EXPECT_EQ(view.data_handle().get_label(), "test_view"); + size_t expected_size = 1; + // Avoid pointless comparison of unsigned warning for rank==0 + for (int r = 0; r < static_cast<int>(view_type::rank()); r++) + expected_size *= extents.extent(r); + EXPECT_EQ(view.size(), expected_size); + EXPECT_EQ(view.empty(), expected_size == 0u); +} + +TEST(TEST_CATEGORY, basic_view_extents_ctor) { + test_extents_constructor<double>( + Kokkos::extents<std::size_t, 2, Kokkos::dynamic_extent, 4>(8)); + test_extents_constructor<double>( + Kokkos::extents<std::size_t, 2, Kokkos::dynamic_extent, 4>(0)); + test_extents_constructor<std::size_t>(Kokkos::extents<std::size_t, 2, 4>()); + test_extents_constructor<int>(Kokkos::extents<std::size_t>()); +} + +template <class T, template <std::size_t> class LayoutType, class ExtentsType> +void test_mapping_constructor(const ExtentsType &extents, std::size_t padding) { + using extents_type = ExtentsType; + using layout_type = LayoutType<Kokkos::dynamic_extent>; + using mapping_type = typename layout_type::template mapping<ExtentsType>; + using accessor_type = Kokkos::Impl::CheckedReferenceCountedAccessor< + T, typename ExecutionSpace::memory_space>; + using view_type = + Kokkos::Impl::BasicView<T, extents_type, layout_type, accessor_type>; + static_assert(std::is_same_v<typename view_type::mapping_type, mapping_type>); + + auto mapping = mapping_type(extents, padding); + + view_type view("test_view", mapping); + + EXPECT_TRUE(view.data_handle().has_record()); + EXPECT_NE(view.data_handle().get(), nullptr); + EXPECT_EQ(view.data_handle().use_count(), 1); + EXPECT_EQ(view.data_handle().get_label(), "test_view"); + EXPECT_EQ(view.extents(), mapping.extents()); + EXPECT_EQ(view.is_exhaustive(), mapping.is_exhaustive()); + size_t expected_size = 1; + // Avoid pointless 
comparison of unsigned warning for rank==0 + for (int r = 0; r < static_cast<int>(view_type::rank()); r++) + expected_size *= view.extent(r); + EXPECT_EQ(view.size(), expected_size); + EXPECT_EQ(view.empty(), expected_size == 0u); +} + +TEST(TEST_CATEGORY, basic_view_mapping_ctor_right) { + test_mapping_constructor<double, Kokkos::Experimental::layout_left_padded>( + Kokkos::extents<std::size_t, 2, Kokkos::dynamic_extent>(2, 5), 8); + test_mapping_constructor<std::size_t, + Kokkos::Experimental::layout_left_padded>( + Kokkos::extents<std::size_t>(), 4); + test_mapping_constructor<double, Kokkos::Experimental::layout_left_padded>( + Kokkos::extents<std::size_t, 2, 3>(), 9); + test_mapping_constructor<int, Kokkos::Experimental::layout_right_padded>( + Kokkos::extents<std::size_t, 2, Kokkos::dynamic_extent>(2, 5), 8); + test_mapping_constructor<double, Kokkos::Experimental::layout_right_padded>( + Kokkos::extents<std::size_t>(), 4); + test_mapping_constructor<unsigned, Kokkos::Experimental::layout_right_padded>( + Kokkos::extents<std::size_t, 2, 3>(), 9); +} + +template <class ViewType> +struct MDRangeTestFunctor { + ViewType view; + template <class... Idxs> + KOKKOS_FUNCTION void operator()(Idxs... idxs) const { + view(idxs...) 
= (idxs + ...); + } +}; + +template <class T, class LayoutType, class ExtentsType> +void test_access_with_extents(const ExtentsType &extents) { + using extents_type = ExtentsType; + using layout_type = LayoutType; // honor the layout selected by the caller + using accessor_type = Kokkos::Impl::CheckedReferenceCountedAccessor< + T, typename ExecutionSpace::memory_space>; + using view_type = + Kokkos::Impl::BasicView<T, extents_type, layout_type, accessor_type>; + + auto view = view_type("test_view", extents); + + EXPECT_TRUE(view.data_handle().has_record()); + EXPECT_NE(view.data_handle().get(), nullptr); + + auto mdrange_policy = + make_spanning_mdrange_policy_from_extents<ExecutionSpace>(extents); + + Kokkos::parallel_for(mdrange_policy, MDRangeTestFunctor<view_type>{view}); +} + +template <class T, class LayoutType> +void test_access() { + test_access_with_extents<T, LayoutType>(Kokkos::extents<std::size_t, 5>()); + test_access_with_extents<T, LayoutType>( + Kokkos::extents<std::size_t, 5, 10>()); + test_access_with_extents<T, LayoutType>( + Kokkos::extents<std::size_t, 5, 2, 2, 2, 2, 2>()); +} + +TEST(TEST_CATEGORY, basic_view_access) { + test_access<double, Kokkos::Experimental::layout_left_padded< + Kokkos::dynamic_extent>>(); + test_access<std::size_t, Kokkos::Experimental::layout_right_padded< + Kokkos::dynamic_extent>>(); +} + +#if 0 // TODO: this test should be active after View is put on top of BasicView + template <class T, template <std::size_t> class LayoutType, class SrcViewType, + class ExtentsType> + void test_construct_from_view(const ExtentsType &extents, + std::size_t padding) { + using extents_type = ExtentsType; + using layout_type = LayoutType<Kokkos::dynamic_extent>; + using mapping_type = typename layout_type::template mapping<ExtentsType>; + using accessor_type = Kokkos::Impl::CheckedReferenceCountedAccessor< + T, typename ExecutionSpace::memory_space>; + using basic_view_type = + Kokkos::Impl::BasicView<T, extents_type, layout_type, accessor_type>; 
+ using view_type = SrcViewType; + static_assert(std::is_constructible_v<basic_view_type, SrcViewType>); + } +#endif + +#if 0 // TODO: this test should be active after View is put on top of BasicView +TEST(TEST_CATEGORY, basic_view_view_ctor) { + test_construct_from_view<double, + Kokkos::Experimental::layout_left_padded, + Kokkos::View<double[3], Kokkos::LayoutLeft, ExecutionSpace>>( + Kokkos::extents<std::size_t, 3>(), 0); + + test_construct_from_view<size_t, + Kokkos::Experimental::layout_left_padded, + Kokkos::View<double[3], Kokkos::LayoutLeft, ExecutionSpace>>( + Kokkos::extents<std::size_t, Kokkos::dynamic_extent>(3), 0); +} +#endif + +template <class T> +void test_atomic_accessor() { + using extents_type = Kokkos::extents<int, 10, 12, 30>; + using layout_type = Kokkos::Experimental::layout_right_padded<>; + using accessor_type = + Kokkos::Impl::CheckedReferenceCountedRelaxedAtomicAccessor< + T, typename ExecutionSpace::memory_space>; + using view_type = + Kokkos::Impl::BasicView<T, extents_type, layout_type, accessor_type>; + using um_accessor_type = Kokkos::Impl::CheckedRelaxedAtomicAccessor< + T, typename ExecutionSpace::memory_space>; + using um_view_type = + Kokkos::Impl::BasicView<T, extents_type, layout_type, um_accessor_type>; + + extents_type extents{}; + auto view = view_type("test_view", extents); + um_view_type um_view(view); + + EXPECT_TRUE(view.data_handle().has_record()); + EXPECT_NE(view.data_handle().get(), nullptr); + + auto mdrange_policy = + make_spanning_mdrange_policy_from_extents<ExecutionSpace>(extents); + + Kokkos::parallel_for(mdrange_policy, MDRangeTestFunctor<view_type>{view}); + Kokkos::parallel_for(mdrange_policy, + MDRangeTestFunctor<um_view_type>{um_view}); +} + +TEST(TEST_CATEGORY, basic_view_atomic_accessor) { + test_atomic_accessor<int>(); + test_atomic_accessor<double>(); +// FIXME OPENACC atomics +#ifndef KOKKOS_ENABLE_OPENACC + test_atomic_accessor<Kokkos::complex<double>>(); +#endif +} + +} // namespace diff --git 
a/packages/kokkos/core/unit_test/view/TestBasicViewMDSpanConversion.cpp b/packages/kokkos/core/unit_test/view/TestBasicViewMDSpanConversion.cpp new file mode 100644 index 0000000000000000000000000000000000000000..233e95534e9c4be3d1919377a65bf9b9efe85ff3 --- /dev/null +++ b/packages/kokkos/core/unit_test/view/TestBasicViewMDSpanConversion.cpp @@ -0,0 +1,95 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> +#include <type_traits> + +#if 0 // TODO: after View is using BasicView this should be true +static_assert( + std::is_convertible_v< + Kokkos::View<long long ****, Kokkos::LayoutRight, Kokkos::Serial>, + Kokkos::Impl::BasicView<long long, Kokkos::dextents<size_t, 4>, + Kokkos::Experimental::layout_right_padded<>, + Kokkos::Impl::CheckedReferenceCountedAccessor< + long long, Kokkos::HostSpace>>>); +#endif + +static_assert( + std::is_convertible_v< + Kokkos::Impl::BasicView<long long, Kokkos::dextents<size_t, 4>, + Kokkos::Experimental::layout_right_padded<>, + Kokkos::Impl::CheckedReferenceCountedAccessor< + long long, Kokkos::HostSpace>>, + Kokkos::Impl::BasicView<const long long, Kokkos::dextents<size_t, 4>, + Kokkos::Experimental::layout_right_padded<>, + Kokkos::Impl::CheckedReferenceCountedAccessor< + const long long, Kokkos::HostSpace>>>); +#if 0 // TODO: after View is using BasicView this should be true +static_assert( + std::is_convertible_v< + Kokkos::View<long long ****, Kokkos::LayoutRight, Kokkos::Serial>, + Kokkos::Impl::BasicView<const 
long long, Kokkos::dextents<size_t, 4>, + Kokkos::Experimental::layout_right_padded<>, + Kokkos::Impl::CheckedReferenceCountedAccessor< + const long long, Kokkos::HostSpace>>>); + +using test_atomic_view = Kokkos::View<double *, Kokkos::Serial, + Kokkos::MemoryTraits<Kokkos::Atomic>>; +static_assert(std::is_same_v< + decltype(std::declval<test_atomic_view>()(std::declval<int>())), + desul::AtomicRef<double, desul::MemoryOrderRelaxed, + desul::MemoryScopeDevice>>); +#endif + +static_assert(std::is_convertible_v<Kokkos::default_accessor<double>, + Kokkos::Impl::ReferenceCountedAccessor< + double, Kokkos::HostSpace, + Kokkos::default_accessor<double>>>); + +static_assert(std::is_constructible_v<Kokkos::default_accessor<const double>, + Kokkos::default_accessor<double>>); + +static_assert(std::is_convertible_v<Kokkos::default_accessor<double>, + Kokkos::default_accessor<const double>>); + +static_assert( + std::is_constructible_v< + Kokkos::Impl::ReferenceCountedAccessor< + const double, Kokkos::HostSpace, + Kokkos::default_accessor<const double>>, + Kokkos::Impl::ReferenceCountedAccessor< + double, Kokkos::HostSpace, Kokkos::default_accessor<double>>>); + +static_assert(std::is_convertible_v< + Kokkos::Impl::ReferenceCountedAccessor< + double, Kokkos::HostSpace, Kokkos::default_accessor<double>>, + Kokkos::Impl::ReferenceCountedAccessor< + const double, Kokkos::HostSpace, + Kokkos::default_accessor<const double>>>); + +static_assert(std::is_constructible_v<Kokkos::default_accessor<const double>, + Kokkos::Impl::ReferenceCountedAccessor< + double, Kokkos::HostSpace, + Kokkos::default_accessor<double>>>); + +static_assert( + std::is_convertible_v< + Kokkos::Impl::SpaceAwareAccessor< + Kokkos::HostSpace, + Kokkos::Impl::ReferenceCountedAccessor< + double, Kokkos::HostSpace, Kokkos::default_accessor<double>>>, + Kokkos::Impl::SpaceAwareAccessor< + Kokkos::HostSpace, Kokkos::default_accessor<const double>>>); diff --git 
a/packages/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp b/packages/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp index b95890614e0a60ff31b1e23b392236d96e63b101..be7052205a7c3968c939fe0d728156dcd20a9a87 100644 --- a/packages/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp +++ b/packages/kokkos/core/unit_test/view/TestExtentsDatatypeConversion.cpp @@ -23,15 +23,14 @@ namespace { // Helper to make static tests more succinct template <typename DataType, typename Extent> -constexpr bool datatype_matches_extent = - std::is_same_v<typename Kokkos::Experimental::Impl::ExtentsFromDataType< - std::size_t, DataType>::type, - Extent>; +constexpr bool datatype_matches_extent = std::is_same_v< + typename Kokkos::Impl::ExtentsFromDataType<std::size_t, DataType>::type, + Extent>; template <typename DataType, typename BaseType, typename Extents> constexpr bool extent_matches_datatype = - std::is_same_v<DataType, typename Kokkos::Experimental::Impl:: - DataTypeFromExtents<BaseType, Extents>::type>; + std::is_same_v<DataType, typename Kokkos::Impl::DataTypeFromExtents< + BaseType, Extents>::type>; // Conversion from DataType to extents // 0-rank view @@ -50,9 +49,9 @@ static_assert( // Both dynamic and static static_assert(datatype_matches_extent< - double* * [3][2][8], + double** [3][2][8], Kokkos::extents<std::size_t, Kokkos::dynamic_extent, - Kokkos::dynamic_extent, std::size_t{3}, + Kokkos::dynamic_extent, std::size_t { 3 }, std::size_t{2}, std::size_t{8}>>); // Conversion from extents to DataType @@ -73,7 +72,7 @@ static_assert(extent_matches_datatype<double[7][5][3], double, // both dynamic and static static_assert( - extent_matches_datatype<double** * [20][45], double, + extent_matches_datatype<double*** [20][45], double, Kokkos::extents<std::size_t, Kokkos::dynamic_extent, Kokkos::dynamic_extent, Kokkos::dynamic_extent, 20, 45>>); diff --git a/packages/kokkos/core/unit_test/view/TestReferenceCountedAccessor.hpp 
b/packages/kokkos/core/unit_test/view/TestReferenceCountedAccessor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4d4ee212b40ef55171fbea0d262f872de69fa6d2 --- /dev/null +++ b/packages/kokkos/core/unit_test/view/TestReferenceCountedAccessor.hpp @@ -0,0 +1,156 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +#include <gtest/gtest.h> + +namespace { +using element_t = float; +using memory_space_t = TEST_EXECSPACE::memory_space; +using defacc_t = Kokkos::default_accessor<element_t>; +using const_defacc_t = Kokkos::default_accessor<const element_t>; +using acc_t = + Kokkos::Impl::ReferenceCountedAccessor<element_t, memory_space_t, defacc_t>; +using const_acc_t = + Kokkos::Impl::ReferenceCountedAccessor<const element_t, memory_space_t, + const_defacc_t>; +using data_handle_t = typename acc_t::data_handle_type; +using const_data_handle_t = typename const_acc_t::data_handle_type; +} // namespace + +TEST(TEST_CATEGORY, RefCountedAcc_Typedefs) { + static_assert(std::is_same_v<typename acc_t::element_type, element_t>); + static_assert( + std::is_same_v< + typename acc_t::data_handle_type, + Kokkos::Impl::ReferenceCountedDataHandle<element_t, memory_space_t>>); + static_assert( + std::is_same_v<typename acc_t::reference, typename defacc_t::reference>); + static_assert( + std::is_same_v< + typename acc_t::offset_policy, + Kokkos::Impl::ReferenceCountedAccessor< + element_t, memory_space_t, typename defacc_t::offset_policy>>); +} + 
+template <class T> +KOKKOS_FUNCTION void unused_variable_sink(T) {} + +void test_refcountedacc_ctors() { + Kokkos::parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), KOKKOS_LAMBDA(int) { + // default ctor and non-const to const + { + acc_t acc; + const_acc_t c_acc(acc); + static_assert(!std::is_constructible_v<acc_t, const_acc_t>); + + unused_variable_sink(c_acc); +} +// from default_accessor +{ + defacc_t defacc; + const_defacc_t c_defacc; + acc_t acc(defacc); + const_acc_t c_acc1(defacc); + const_acc_t c_acc2(c_defacc); + static_assert(!std::is_constructible_v<acc_t, const_defacc_t>); + + unused_variable_sink(acc); + unused_variable_sink(c_acc1); + unused_variable_sink(c_acc2); +} +}); +} + +TEST(TEST_CATEGORY, RefCountedAcc_Ctors) { test_refcountedacc_ctors(); } + +void test_refcountedacc_conversion_to_default_acc() { + Kokkos::parallel_for( + Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), KOKKOS_LAMBDA(int) { + // default ctor and non-const to const + acc_t acc; + const_acc_t c_acc; + defacc_t defacc(acc); + const_defacc_t c_defacc1(acc); + const_defacc_t c_defacc2(c_acc); + (void)defacc; + (void)c_defacc1; + (void)c_defacc2; + static_assert(!std::is_constructible_v<defacc_t, const_acc_t>); + }); +} + +TEST(TEST_CATEGORY, RefCountedAcc_ConversionToDefaultAcc) { + test_refcountedacc_conversion_to_default_acc(); +} + +void test_refcountedacc_access() { + element_t* ptr = static_cast<element_t*>( + Kokkos::kokkos_malloc<TEST_EXECSPACE::memory_space>(100 * + sizeof(element_t))); + // Gonna use unmanaged data handles here (i.e. 
not actually reference + // counted) + data_handle_t dh(ptr); + const_data_handle_t cdh(ptr); + + Kokkos::View<int, TEST_EXECSPACE> errors("Errors"); + Kokkos::parallel_for( + Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), KOKKOS_LAMBDA(int) { + acc_t acc; + const_acc_t c_acc; + if (&acc.access(dh, 5) != ptr + 5) errors() += 1; + if (&c_acc.access(cdh, 5) != ptr + 5) errors() += 2; + }); + int h_errors = 0; + Kokkos::deep_copy(h_errors, errors); + ASSERT_FALSE(h_errors & 1); + ASSERT_FALSE(h_errors & 2); + Kokkos::kokkos_free<TEST_EXECSPACE>(ptr); +} + +TEST(TEST_CATEGORY, RefCountedAcc_Access) { test_refcountedacc_access(); } + +void test_refcountedacc_conversion() { + Kokkos::parallel_for( + Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), KOKKOS_LAMBDA(int) { + using acc_anonym_t = Kokkos::Impl::ReferenceCountedAccessor< + element_t, Kokkos::AnonymousSpace, defacc_t>; + using const_acc_anonym_t = Kokkos::Impl::ReferenceCountedAccessor< + const element_t, Kokkos::AnonymousSpace, const_defacc_t>; + acc_t acc; + const_acc_t c_acc(acc); + acc_anonym_t acc_anonym(acc); + const_acc_anonym_t c_acc_anonym(acc); + acc = acc_anonym; + c_acc = acc_anonym; + static_assert(!std::is_constructible_v<acc_t, const_acc_t>); + static_assert(!std::is_constructible_v<acc_anonym_t, const_acc_t>); + static_assert( + !std::is_constructible_v<acc_anonym_t, const_acc_anonym_t>); + static_assert( + !std::is_constructible_v<Kokkos::Impl::ReferenceCountedAccessor< + double, memory_space_t, defacc_t>, + acc_t>); + + unused_variable_sink(c_acc); + unused_variable_sink(c_acc_anonym); + }); +} + +TEST(TEST_CATEGORY, RefCountedAcc_Conversion) { + test_refcountedacc_conversion(); +} diff --git a/packages/kokkos/core/unit_test/view/TestReferenceCountedDataHandle.hpp b/packages/kokkos/core/unit_test/view/TestReferenceCountedDataHandle.hpp new file mode 100644 index 0000000000000000000000000000000000000000..249c08578f100fdcfe73f71ad8957f0edf6c7111 --- /dev/null +++ 
b/packages/kokkos/core/unit_test/view/TestReferenceCountedDataHandle.hpp @@ -0,0 +1,208 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +#include <gtest/gtest.h> + +namespace { +using element_t = float; +using mem_t = typename TEST_EXECSPACE::memory_space; +using data_handle_t = + Kokkos::Impl::ReferenceCountedDataHandle<element_t, mem_t>; +using const_data_handle_t = + Kokkos::Impl::ReferenceCountedDataHandle<const element_t, mem_t>; +using data_handle_anonym_t = + Kokkos::Impl::ReferenceCountedDataHandle<element_t, Kokkos::AnonymousSpace>; +using const_data_handle_anonym_t = + Kokkos::Impl::ReferenceCountedDataHandle<const element_t, + Kokkos::AnonymousSpace>; + +} // namespace + +TEST(TEST_CATEGORY, RefCountedDataHandle_Typedefs) { + static_assert(std::is_same_v<data_handle_t::value_type, element_t>); + static_assert(std::is_same_v<data_handle_t::pointer, element_t*>); + static_assert(std::is_same_v<data_handle_t::reference, element_t&>); + static_assert(std::is_same_v<data_handle_t::memory_space, mem_t>); +} + +template <class DataHandleType, class ConstDataHandleType> +void test_ref_counted_data_handle() { + auto shared_alloc = + Kokkos::Impl::make_shared_allocation_record<element_t, mem_t, + TEST_EXECSPACE>( + 100, "Test", mem_t(), std::optional<TEST_EXECSPACE>(std::nullopt), + std::bool_constant<true>(), // init + std::bool_constant<false>()); // sequential_host_init + + element_t* ptr = static_cast<element_t*>(shared_alloc->data()); + const 
element_t* c_ptr = ptr; + DataHandleType dh(shared_alloc); + ASSERT_EQ(dh.use_count(), 1); + ASSERT_EQ(dh.get_label(), std::string("Test")); + ASSERT_EQ(dh.get(), ptr); + ASSERT_EQ(dh.has_record(), true); + { + element_t* ptr_tmp(dh); + ASSERT_EQ(ptr_tmp, ptr); + static_assert(!std::is_convertible_v<data_handle_t, element_t*>); + } + { + ConstDataHandleType c_dh(dh); + ASSERT_EQ(dh.use_count(), 2); + ASSERT_EQ(c_dh.use_count(), 2); + } + ASSERT_EQ(dh.use_count(), 1); + + DataHandleType um_dh(ptr); + ASSERT_EQ(um_dh.get(), ptr); + ASSERT_EQ(um_dh.has_record(), false); + + DataHandleType dh_offset(dh, ptr + 5); + ASSERT_EQ(dh_offset.use_count(), 2); + ASSERT_EQ(dh_offset.get(), ptr + 5); + ASSERT_EQ(dh_offset.get_label(), std::string("Test")); + ASSERT_EQ(dh_offset.has_record(), true); + { + element_t* ptr_tmp(dh_offset); + ASSERT_EQ(ptr_tmp, ptr + 5); + } + Kokkos::View<int, TEST_EXECSPACE> errors("Errors"); + + // clang-format screws the following pieces up for some reason + // Tested with 16 and with 18 to the same effect + // clang-format off + Kokkos::parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), KOKKOS_LAMBDA(int) { + + // default ctor and non-const to const + { + DataHandleType dh2(dh); + if(dh2.get() != ptr) errors() += 1; + ConstDataHandleType c_dh2(dh); + ConstDataHandleType c_dh3(c_dh2); + static_assert(!std::is_constructible_v<data_handle_t, const_data_handle_t>); + } + + { + // ctor from pointer + DataHandleType dh2(ptr); + if (dh2.get() != ptr) errors() += 2; + ConstDataHandleType c_dh1(ptr); + if (c_dh1.get() != ptr) errors() += 4; + ConstDataHandleType c_dh2(c_ptr); + if (c_dh2.get() != ptr) errors() += 8; + static_assert(!std::is_constructible_v<data_handle_t, decltype(c_ptr)>); + } + + // ctor for subviews + { + DataHandleType dh2(dh, ptr + 5); + if (dh2.get() != ptr + 5) errors() += 16; + } + }); + + int h_errors = 0; + Kokkos::deep_copy(h_errors, errors); + ASSERT_FALSE(h_errors & 1); + ASSERT_FALSE(h_errors & 2); + ASSERT_FALSE(h_errors 
& 4); + ASSERT_FALSE(h_errors & 8); + ASSERT_FALSE(h_errors & 16); +} +// clang-format on + +TEST(TEST_CATEGORY, RefCountedDataHandle) { + test_ref_counted_data_handle<data_handle_t, const_data_handle_t>(); +} + +TEST(TEST_CATEGORY, RefCountedDataHandleAnonym) { + test_ref_counted_data_handle<data_handle_anonym_t, + const_data_handle_anonym_t>(); +} + +template <class T> +KOKKOS_FUNCTION void unused_variable_sink(T) {} + +void test_ref_counted_data_handle_conversion() { + auto shared_alloc1 = + Kokkos::Impl::make_shared_allocation_record<element_t, mem_t, + TEST_EXECSPACE>( + 100, "Test1", mem_t(), std::optional<TEST_EXECSPACE>(std::nullopt), + std::bool_constant<true>(), // init + std::bool_constant<false>()); // sequential_host_init + + element_t* ptr1 = static_cast<element_t*>(shared_alloc1->data()); + const element_t* c_ptr1 = ptr1; + unused_variable_sink(c_ptr1); + + data_handle_t dh(shared_alloc1); + ASSERT_EQ(dh.use_count(), 1); + ASSERT_EQ(dh.get_label(), std::string("Test1")); + ASSERT_EQ(dh.get(), ptr1); + ASSERT_EQ(dh.has_record(), true); + + auto shared_alloc2 = + Kokkos::Impl::make_shared_allocation_record<element_t, mem_t, + TEST_EXECSPACE>( + 100, "Test2", mem_t(), std::optional<TEST_EXECSPACE>(std::nullopt), + std::bool_constant<true>(), // init + std::bool_constant<false>()); // sequential_host_init + + element_t* ptr2 = static_cast<element_t*>(shared_alloc2->data()); + const element_t* c_ptr2 = ptr2; + unused_variable_sink(c_ptr2); + + data_handle_anonym_t dha(shared_alloc2); + ASSERT_EQ(dha.use_count(), 1); + ASSERT_EQ(dha.get_label(), std::string("Test2")); + ASSERT_EQ(dha.get(), ptr2); + ASSERT_EQ(dha.has_record(), true); + + { + data_handle_anonym_t dha2(dh); + ASSERT_EQ(dha2.use_count(), 2); + ASSERT_EQ(dha2.get_label(), std::string("Test1")); + ASSERT_EQ(dha2.get(), ptr1); + ASSERT_EQ(dha2.has_record(), true); + + data_handle_t dh2(dha); + ASSERT_EQ(dh2.use_count(), 2); + ASSERT_EQ(dh2.get_label(), std::string("Test2")); + 
ASSERT_EQ(dh2.get(), ptr2); + ASSERT_EQ(dh2.has_record(), true); + + dha2 = dh2; + ASSERT_EQ(dha2.use_count(), 3); + ASSERT_EQ(dha2.get_label(), std::string("Test2")); + ASSERT_EQ(dha2.get(), ptr2); + ASSERT_EQ(dha2.has_record(), true); + } + + ASSERT_EQ(dh.use_count(), 1); + ASSERT_EQ(dh.get_label(), std::string("Test1")); + ASSERT_EQ(dh.get(), ptr1); + ASSERT_EQ(dh.has_record(), true); + + ASSERT_EQ(dha.use_count(), 1); + ASSERT_EQ(dha.get_label(), std::string("Test2")); + ASSERT_EQ(dha.get(), ptr2); + ASSERT_EQ(dha.has_record(), true); +} + +TEST(TEST_CATEGORY, RefCountedDataHandleConversion) { + test_ref_counted_data_handle_conversion(); +} diff --git a/packages/kokkos/example/CMakeLists.txt b/packages/kokkos/example/CMakeLists.txt index 3920dc9a2776444ceacd1a69a956f5a9453c1266..81363b5e07e0434feaf79de18ca43178540663e4 100644 --- a/packages/kokkos/example/CMakeLists.txt +++ b/packages/kokkos/example/CMakeLists.txt @@ -1,2 +1,9 @@ -KOKKOS_ADD_EXAMPLE_DIRECTORIES(query_device) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(tutorial) +kokkos_add_example_directories(query_device) +if(_DEVICE_PARALLEL STREQUAL "NoTypeDefined" + OR KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + OR KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE + OR KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE +) + kokkos_add_example_directories(relocatable_function) +endif() +kokkos_add_example_directories(tutorial) diff --git a/packages/kokkos/example/README b/packages/kokkos/example/README index 66860512448a25c0019b862babbbe08d7cf351cf..2fe87276484873b138ae97ef1a55a0a5230b0e59 100644 --- a/packages/kokkos/example/README +++ b/packages/kokkos/example/README @@ -1,7 +1,7 @@ This directory contains example application proxies that use different parts of Kokkos. If you are looking for the FENL ("finite element -nonlinear" solve) example, it has moved into the LinAlg subpackage of -Tpetra. +nonlinear" solve) example, it has moved into the TrilinosCouplings +package in Trilinos. 
MANIFEST: diff --git a/packages/kokkos/example/build_cmake_installed/CMakeLists.txt b/packages/kokkos/example/build_cmake_installed/CMakeLists.txt index aaf745b418de894d3403bd0e3176eb972938ac5b..c025f1d7d2894e20eecedcec2d41abc2ba942706 100644 --- a/packages/kokkos/example/build_cmake_installed/CMakeLists.txt +++ b/packages/kokkos/example/build_cmake_installed/CMakeLists.txt @@ -12,6 +12,7 @@ find_package(Kokkos REQUIRED) add_executable(example cmake_example.cpp foo.f) if(CMAKE_Fortran_COMPILER_ID STREQUAL LLVMFlang) set_target_properties(example PROPERTIES LINKER_LANGUAGE Fortran) + target_link_options(example PRIVATE -fno-fortran-main) endif() # This is the only thing required to set up compiler/linker flags diff --git a/packages/kokkos/example/query_device/CMakeLists.txt b/packages/kokkos/example/query_device/CMakeLists.txt index 86956ba3ba4855d0e769d92fd32e7b225c603157..15c4bf6d162fc6e34eea070e1425b84e02ae6909 100644 --- a/packages/kokkos/example/query_device/CMakeLists.txt +++ b/packages/kokkos/example/query_device/CMakeLists.txt @@ -1,12 +1,7 @@ +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) - -SET(SOURCES "") -FILE(GLOB SOURCES *.cpp) - -KOKKOS_ADD_EXECUTABLE( - query_device - SOURCES ${SOURCES} -) +set(SOURCES "") +file(GLOB SOURCES *.cpp) +kokkos_add_executable(query_device SOURCES ${SOURCES}) diff --git a/packages/kokkos/example/query_device/query_device.cpp b/packages/kokkos/example/query_device/query_device.cpp index ad6e5f1113e23ca3f1b6b93a58bbbdaafdb76028..936174aee503f7b6acec86a0789992dd647b17da 100644 --- a/packages/kokkos/example/query_device/query_device.cpp +++ b/packages/kokkos/example/query_device/query_device.cpp @@ -19,7 +19,7 @@ #include <Kokkos_Macros.hpp> -//#define USE_MPI +// #define USE_MPI #if defined(USE_MPI) #include <mpi.h> #endif diff --git 
a/packages/kokkos/example/relocatable_function/CMakeLists.txt b/packages/kokkos/example/relocatable_function/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..51f8d3c6ecdccc7c2f7c1b800ab09a6f81cb2885 --- /dev/null +++ b/packages/kokkos/example/relocatable_function/CMakeLists.txt @@ -0,0 +1,6 @@ +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +kokkos_add_executable(example_relocatable_function SOURCES main.cpp functor.cpp) + +add_test(NAME Kokkos_Example_RelocatableFunction COMMAND Kokkos_example_relocatable_function) diff --git a/packages/kokkos/example/relocatable_function/Makefile b/packages/kokkos/example/relocatable_function/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..09259690c748d68443f8d0183e70d0f5d8d190c3 --- /dev/null +++ b/packages/kokkos/example/relocatable_function/Makefile @@ -0,0 +1,33 @@ +KOKKOS_PATH = ../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/relocatable_function/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +EXE = Kokkos_example_relocatable_function + +LINK = ${CXX} + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) -o $(EXE) + +clean: kokkos-clean + rm -f *.o Kokkos_example_relocatable_function + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp b/packages/kokkos/example/relocatable_function/functor.cpp similarity index 81% rename from packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp rename to packages/kokkos/example/relocatable_function/functor.cpp index 
81e9d5e6aa589a6dfc3b883c173203eb64a50bf3..3d9cde107fa74c7b481ee11bf4935a9d26aa9e62 100644 --- a/packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp +++ b/packages/kokkos/example/relocatable_function/functor.cpp @@ -14,6 +14,8 @@ // //@HEADER -// Deprecated file for backward compatibility +#include <Kokkos_Macros.hpp> -#include <impl/Kokkos_ViewMapping.hpp> +KOKKOS_RELOCATABLE_FUNCTION void count_even(const long i, long& lcount) { + lcount += (i % 2) == 0; +} diff --git a/packages/kokkos/example/relocatable_function/main.cpp b/packages/kokkos/example/relocatable_function/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07204410e9bf677bf67f761e23c61648d63d4631 --- /dev/null +++ b/packages/kokkos/example/relocatable_function/main.cpp @@ -0,0 +1,50 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include <Kokkos_Core.hpp> + +#include <iostream> + +KOKKOS_RELOCATABLE_FUNCTION void count_even(const long i, long& lcount); + +int main(int argc, char* argv[]) { + Kokkos::ScopeGuard scope_guard(argc, argv); + + for (int n = 10; n <= 100'000'000; n *= 10) { + Kokkos::Timer timer; + + long count = 0; + // Compute the number of even integers from 0 to n-1 using a relocatable + // functor + Kokkos::parallel_reduce( + n, KOKKOS_LAMBDA(const long i, long& lcount) { count_even(i, lcount); }, + count); + + double count_time_relocatable = timer.seconds(); + + timer.reset(); + + // Compute the number of even integers from 0 to n-1 using an inline lambda + Kokkos::parallel_reduce( + n, + KOKKOS_LAMBDA(const long i, long& lcount) { lcount += (i % 2) == 0; }, + count); + + double count_time_inline = timer.seconds(); + std::cout << std::scientific << n * 1. << ' ' << count_time_relocatable + << "s (relocatable) vs. " << count_time_inline << "s (inline)\n"; + } +} diff --git a/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt b/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt index e1b90b133ddec1dc8a848a9ab8d2253980edd301..9a39169e2b49d637ecabbaecf399ee951996ab95 100644 --- a/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt @@ -1,10 +1,5 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_01_hello_world - SOURCES hello_world.cpp -) - +kokkos_add_executable(tutorial_01_hello_world SOURCES hello_world.cpp) diff --git a/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp index 5b8a21af833bba0ab4d6d3bb6fbf3f3a52ce9e84..3104003fb4872812b94df50770d8b6ae510256f6 100644 --- a/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp +++ b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp @@ -16,7 +16,6 @@ #include <Kokkos_Core.hpp> #include <cstdio> -#include <typeinfo> // // "Hello world" parallel_for example: @@ -25,12 +24,12 @@ // using a functor to define the loop body // 3. Shut down Kokkos // -// If Kokkos was built with C++11 enabled, try comparing this example -// to 01_hello_world_lambda. The latter uses C++11 lambdas (anonymous -// functions) to define the loop body of the parallel_for. That makes -// the code much more concise and readable. On the other hand, -// breaking out the loop body into an explicit functor makes it easier -// to test the loop independently of the parallel pattern. +// Try comparing this example to 01_hello_world_lambda, which uses +// C++11 lambdas (anonymous functions) to define the loop body of the +// parallel_for. That makes the code much more concise and readable. +// On the other hand, breaking out the loop body into an explicit +// functor makes it easier to test the loop independently of the +// parallel pattern. // // Functor that defines the parallel_for's loop body. @@ -58,12 +57,7 @@ struct hello_world { // is unnecessary but harmless. KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + Kokkos::printf("Hello from i = %i\n", i); } }; @@ -77,11 +71,9 @@ int main(int argc, char* argv[]) { // start with "--kokkos-". 
Kokkos::initialize(argc, argv); - // Print the name of Kokkos' default execution space. We're using - // typeid here, so the name might get a bit mangled by the linker, - // but you should still be able to figure out what it is. + // Print the name of Kokkos' default execution space. printf("Hello World on Kokkos execution space %s\n", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); // Run the above functor on the default Kokkos execution space in // parallel, with a parallel for loop count of 15. diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt index a939a5f0ded6d2953af557e6c62fe783ba7b559e..1acb463f48e3ed561dfa857e170b6bc5d2d08a99 100644 --- a/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt @@ -1,10 +1,5 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_01_hello_world_lambda - SOURCES hello_world_lambda.cpp -) - +kokkos_add_executable(tutorial_01_hello_world_lambda SOURCES hello_world_lambda.cpp) diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp index c78f30763613215b88a78a34c6b5ece9fbfb6cdf..136af5eadfbf8ff1565c20650e42e28bd490ddac 100644 --- a/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp +++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -16,7 +16,6 @@ #include <Kokkos_Core.hpp> #include <cstdio> -#include <typeinfo> // // "Hello world" parallel_for example: @@ -25,10 +24,9 @@ // using a C++11 lambda to define the loop body // 3. Shut down Kokkos // -// This example only builds if C++11 is enabled. Compare this example -// to 01_hello_world, which uses functors (explicitly defined classes) -// to define the loop body of the parallel_for. Both functors and -// lambdas have their places. +// Compare this example to 01_hello_world, which uses functors +// (explicitly defined classes) to define the loop body of the +// parallel_for. Both functors and lambdas have their places. // int main(int argc, char* argv[]) { @@ -41,11 +39,9 @@ int main(int argc, char* argv[]) { // start with "--kokkos-". Kokkos::initialize(argc, argv); - // Print the name of Kokkos' default execution space. We're using - // typeid here, so the name might get a bit mangled by the linker, - // but you should still be able to figure out what it is. + // Print the name of Kokkos' default execution space. printf("Hello World on Kokkos execution space %s\n", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); // Run lambda on the default Kokkos execution space in parallel, // with a parallel for loop count of 15. 
The lambda's argument is @@ -71,20 +67,13 @@ int main(int argc, char* argv[]) { // // You may notice that the printed numbers do not print out in // order. Parallel for loops may execute in any order. - // We also need to protect the usage of a lambda against compiling - // with a backend which doesn't support it (i.e. Cuda 6.5/7.0). -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( 15, KOKKOS_LAMBDA(const int i) { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - // printf works in a CUDA parallel kernel; std::ostream does not. - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + // Kokko::printf works for all backends in a parallel kernel; + // std::ostream does not. + Kokkos::printf("Hello from i = %i\n", i); }); -#endif + // You must call finalize() after you are done using Kokkos. Kokkos::finalize(); } diff --git a/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt b/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt index 21b0c38014b4cd923d9c8ecea07ed645a2775c6e..376ed0ede1b19e66849838a411561a7271f3c2d5 100644 --- a/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt @@ -1,9 +1,5 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_02_simple_reduce - SOURCES simple_reduce.cpp -) +kokkos_add_executable(tutorial_02_simple_reduce SOURCES simple_reduce.cpp) diff --git a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt index 82a87be4bdc46baec421c0363a1481e1ae07e001..0fb41ca4bf195a88b0b3987bff2d0c6b06061a87 100644 --- a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt @@ -1,9 +1,4 @@ +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) - -KOKKOS_ADD_EXECUTABLE( - tutorial_02_simple_reduce_lambda - SOURCES simple_reduce_lambda.cpp -) - +kokkos_add_executable(tutorial_02_simple_reduce_lambda SOURCES simple_reduce_lambda.cpp) diff --git a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp index 5cae6da16cf0325c350c6504168faa211334b2b5..b5e737010169a6205f79cbfcdd094d3dc265cbce 100644 --- a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp +++ b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp @@ -24,9 +24,8 @@ // using a C++11 lambda to define the loop body // 3. Shut down Kokkos // -// This example only builds if C++11 is enabled. Compare this example -// to 02_simple_reduce, which uses a functor to define the loop body -// of the parallel_reduce. +// Compare this example to 02_simple_reduce, which uses a functor to +// define the loop body of the parallel_reduce. // int main(int argc, char* argv[]) { @@ -38,14 +37,11 @@ int main(int argc, char* argv[]) { // functor. 
The lambda takes the same arguments as the functor's // operator(). int sum = 0; -// The KOKKOS_LAMBDA macro replaces the capture-by-value clause [=]. -// It also handles any other syntax needed for CUDA. -// We also need to protect the usage of a lambda against compiling -// with a backend which doesn't support it (i.e. Cuda 6.5/7.0). -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + // The KOKKOS_LAMBDA macro replaces the capture-by-value clause [=]. + // It also handles any other syntax needed for CUDA. Kokkos::parallel_reduce( n, KOKKOS_LAMBDA(const int i, int& lsum) { lsum += i * i; }, sum); -#endif + printf( "Sum of squares of integers from 0 to %i, " "computed in parallel, is %i\n", @@ -61,9 +57,6 @@ int main(int argc, char* argv[]) { "computed sequentially, is %i\n", n - 1, seqSum); Kokkos::finalize(); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + return (sum == seqSum) ? 0 : -1; -#else - return 0; -#endif } diff --git a/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt b/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt index 99a7d39c17ad35dc82eaddbe7fe60f8df15544f6..520468eb895f940a6e1681354d96730bcaf69d83 100644 --- a/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt @@ -1,9 +1,5 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_03_simple_view - SOURCES simple_view.cpp -) +kokkos_add_executable(tutorial_03_simple_view SOURCES simple_view.cpp) diff --git a/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp b/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp index 8be280f12c19083bbebd40461626d725f003f885..604c681faaeaff3b381e35cf0329bbaa90206f32 100644 --- a/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp +++ b/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp @@ -39,7 +39,7 @@ // // The first dimension of the View is the dimension over which it is // efficient for Kokkos to parallelize. -using view_type = Kokkos::View<double * [3]>; +using view_type = Kokkos::View<double* [3]>; // parallel_for functor that fills the View given to its constructor. // The View must already have been allocated. diff --git a/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt index 2f3d9c52de63f6d864b0961ec59ed1d32c92ff05..bc775e0eebbf2013312bbc920964c36e77f04af3 100644 --- a/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt @@ -1,9 +1,5 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_03_simple_view_lambda - SOURCES simple_view_lambda.cpp - ) +kokkos_add_executable(tutorial_03_simple_view_lambda SOURCES simple_view_lambda.cpp) diff --git a/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp b/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp index bdcd45b246e2904aea632550354170ce9ee7cb81..9fba1dddfdf0ef5f6337f46b7ec9543381de1b68 100644 --- a/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp +++ b/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp @@ -38,7 +38,7 @@ // // The first dimension of the View is the dimension over which it is // efficient for Kokkos to parallelize. -using view_type = Kokkos::View<double * [3]>; +using view_type = Kokkos::View<double* [3]>; int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); @@ -61,19 +61,16 @@ int main(int argc, char* argv[]) { // Different Views may have the same label. view_type a("A", 10); -// Fill the View with some data. The parallel_for loop will iterate -// over the View's first dimension N. -// -// Note that the View is passed by value into the lambda. The macro -// KOKKOS_LAMBDA includes the "capture by value" clause [=]. This -// tells the lambda to "capture all variables in the enclosing scope -// by value." Views have "view semantics"; they behave like -// pointers, not like std::vector. Passing them by value does a -// shallow copy. A deep copy never happens unless you explicitly -// ask for one. -// We also need to protect the usage of a lambda against compiling -// with a backend which doesn't support it (i.e. Cuda 6.5/7.0). -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + // Fill the View with some data. The parallel_for loop will iterate + // over the View's first dimension N. + // + // Note that the View is passed by value into the lambda. The macro + // KOKKOS_LAMBDA includes the "capture by value" clause [=]. 
This + // tells the lambda to "capture all variables in the enclosing scope + // by value." Views have "view semantics"; they behave like + // pointers, not like std::vector. Passing them by value does a + // shallow copy. A deep copy never happens unless you explicitly + // ask for one. Kokkos::parallel_for( 10, KOKKOS_LAMBDA(const int i) { // Acesss the View just like a Fortran array. The layout depends @@ -92,7 +89,6 @@ int main(int argc, char* argv[]) { }, sum); printf("Result: %f\n", sum); -#endif } Kokkos::finalize(); } diff --git a/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt b/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt index 03fb97a133caf6039dd048cb0546f502457ddc34..b4e8b59a98b5ac42cd1c5c3711a4e9fdfc6fc678 100644 --- a/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. -KOKKOS_ADD_EXECUTABLE( - tutorial_04_simple_memoryspaces - SOURCES simple_memoryspaces.cpp -) +kokkos_add_executable(tutorial_04_simple_memoryspaces SOURCES simple_memoryspaces.cpp) diff --git a/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp b/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp index fc9e48e51252beb57493641acccedc7935b219d2..1680652e2dff8f0e6e2728353f109eed3d9c6bff 100644 --- a/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp +++ b/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp @@ -19,7 +19,7 @@ // The type of a two-dimensional N x 3 array of double. // It lives in Kokkos' default memory space. 
-using view_type = Kokkos::View<double * [3]>; +using view_type = Kokkos::View<double *[3]>; // The "HostMirror" type corresponding to view_type above is also a // two-dimensional N x 3 array of double. However, it lives in the diff --git a/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt b/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt index 85870e5e504c1b74bfbccaa97e4b9f289d5a28c7..3e1b00e10c2fdc60508a275b126c0f7aee06211b 100644 --- a/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt @@ -1,10 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. -KOKKOS_ADD_EXECUTABLE( - tutorial_05_simple_atomics - SOURCES simple_atomics.cpp -) - +kokkos_add_executable(tutorial_05_simple_atomics SOURCES simple_atomics.cpp) diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt index 2a6c3f6c27a3699d0715c8e5ab41448221432aaf..287ae18afba82ff3b2c05350b8a98d00bdc1f932 100644 --- a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_06_simple_mdrangepolicy - SOURCES simple_mdrangepolicy.cpp -) +kokkos_add_executable(tutorial_06_simple_mdrangepolicy SOURCES simple_mdrangepolicy.cpp) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt index b0db41bf451708c361eb5a0cdeda3413c8532c8e..9f0324b475cc3f287f327cca95ea91a0120faf4b 100644 --- a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. -KOKKOS_ADD_EXECUTABLE( - tutorial_advancedviews_01_data_layouts - SOURCES data_layouts.cpp -) +kokkos_add_executable(tutorial_advancedviews_01_data_layouts SOURCES data_layouts.cpp) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt index 0e50968b4bf2d856b0a1e4b1122929db5aab5326..8ecaf99a95a4cab8ee7d562cf71e4260c26b4e0b 100644 --- a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_advancedviews_02_memory_traits - SOURCES memory_traits.cpp -) +kokkos_add_executable(tutorial_advancedviews_02_memory_traits SOURCES memory_traits.cpp) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt index 90270740c3c49927221ebca87a7c85f16afea074..b392564d93c938ea6128afb76b65d6de8801ca3a 100644 --- a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. -KOKKOS_ADD_EXECUTABLE( - tutorial_advancedviews_03_subviews - SOURCES subviews.cpp -) +kokkos_add_executable(tutorial_advancedviews_03_subviews SOURCES subviews.cpp) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt index 4f9b9225d2114fa52141322993297a8ac92713c8..bd55c0d6cfca9a4455f1d834a84ee75a5cb826de 100644 --- a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_advancedviews_04_dualviews - SOURCES dual_view.cpp -) +kokkos_add_executable(tutorial_advancedviews_04_dualviews SOURCES dual_view.cpp) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp index 3bf8b3dbf3e809cf7673c3343b8d08d389170ad9..30fe6e16b74d2c597879a44a6b4b759241435b8c 100644 --- a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp +++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp @@ -48,9 +48,9 @@ struct localsum { // overrides Kokkos' default execution space. using execution_space = ExecutionSpace; - using memory_space = typename Kokkos::Impl::if_c< - std::is_same<ExecutionSpace, Kokkos::DefaultExecutionSpace>::value, - idx_type::memory_space, idx_type::host_mirror_space>::type; + using memory_space = std::conditional_t< + std::is_same_v<ExecutionSpace, Kokkos::DefaultExecutionSpace>, + idx_type::memory_space, idx_type::host_mirror_space>; // Get the view types on the particular device for which the functor // is instantiated. diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt index 9e9af9872c907256f37de501e6c5783fe8074647..e9e71d523543e03768b41af01791b00e35c7532f 100644 --- a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt @@ -1,11 +1,7 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) - -IF (Kokkos_ENABLE_CUDA_UVM) -# This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_advancedviews_05_nvidia_uvm - SOURCES uvm_example.cpp -) -ENDIF () +if(Kokkos_ENABLE_CUDA_UVM) + # This is a tutorial, not a test, so we don't ask CTest to run it. + kokkos_add_executable(tutorial_advancedviews_05_nvidia_uvm SOURCES uvm_example.cpp) +endif() diff --git a/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt index 11da617b8fa0bab71b80ffadae3d8f22b79dd510..e77a454e136a901e6bb24d3c709775c0f32cf806 100644 --- a/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt @@ -1,9 +1,8 @@ +kokkos_add_example_directories(01_data_layouts) +kokkos_add_example_directories(02_memory_traits) +kokkos_add_example_directories(03_subviews) +kokkos_add_example_directories(04_dualviews) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_data_layouts) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_memory_traits) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_subviews) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(04_dualviews) - -IF (Kokkos_ENABLE_CUDA_UVM) - KOKKOS_ADD_EXAMPLE_DIRECTORIES(05_NVIDIA_UVM) -ENDIF () +if(Kokkos_ENABLE_CUDA_UVM) + kokkos_add_example_directories(05_NVIDIA_UVM) +endif() diff --git a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/CMakeLists.txt b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..36f58bf7622528838ecfcdb88eea57c3afd124d2 --- /dev/null +++ b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/CMakeLists.txt @@ -0,0 +1,5 @@ +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+kokkos_add_executable(tutorial_algorithms_01_random_numbers SOURCES random_numbers.cpp) diff --git a/packages/kokkos/example/tutorial/Algorithms/CMakeLists.txt b/packages/kokkos/example/tutorial/Algorithms/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c610179fa87bb3532eca7cfdd9380e0f09224b7 --- /dev/null +++ b/packages/kokkos/example/tutorial/Algorithms/CMakeLists.txt @@ -0,0 +1 @@ +kokkos_add_example_directories(01_random_numbers) diff --git a/packages/kokkos/example/tutorial/CMakeLists.txt b/packages/kokkos/example/tutorial/CMakeLists.txt index efbf8614a0df9a2c1816f2f830ab0a95b2bfccd4..f096b9ef63e3c2174dd569b19d0005de04a1f36a 100644 --- a/packages/kokkos/example/tutorial/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/CMakeLists.txt @@ -1,14 +1,14 @@ +kokkos_add_example_directories(01_hello_world) +kokkos_add_example_directories(02_simple_reduce) +kokkos_add_example_directories(03_simple_view) +kokkos_add_example_directories(04_simple_memoryspaces) +kokkos_add_example_directories(05_simple_atomics) +kokkos_add_example_directories(06_simple_mdrangepolicy) +kokkos_add_example_directories(Advanced_Views) +kokkos_add_example_directories(Algorithms) +kokkos_add_example_directories(Hierarchical_Parallelism) +kokkos_add_example_directories(launch_bounds) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_hello_world) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_simple_view) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(04_simple_memoryspaces) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(05_simple_atomics) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(06_simple_mdrangepolicy) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(Advanced_Views) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(Hierarchical_Parallelism) - -KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_hello_world_lambda) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce_lambda) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_simple_view_lambda) - +kokkos_add_example_directories(01_hello_world_lambda) 
+kokkos_add_example_directories(02_simple_reduce_lambda) +kokkos_add_example_directories(03_simple_view_lambda) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt index e7cd6dea07fc86821911e265ee276aee331f97d5..755d57dc5250ad0c70b7bc91077692579e458704 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. -KOKKOS_ADD_EXECUTABLE( - tutorial_hierarchicalparallelism_01_thread_teams - SOURCES thread_teams.cpp -) +kokkos_add_executable(tutorial_hierarchicalparallelism_01_thread_teams SOURCES thread_teams.cpp) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp index b041f8d435b9586c42f59806ad7c2c3d159468f7..ee3f4721d917e3c90adf98dbfc793fe1030d9e71 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp @@ -47,13 +47,9 @@ struct hello_world { // The TeamPolicy<>::member_type provides functions to query the multi // dimensional index of a thread as well as the number of thread-teams and // the size of each team. 
-#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); } }; diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt index 8c7f3853a014cfe71750060e6d077e1e4f0d777b..980efccf96bd31237bb5069556179bea5fd25c62 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt @@ -1,10 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. -KOKKOS_ADD_EXECUTABLE( - tutorial_hierarchical_01_thread_teams_lambda - SOURCES thread_teams_lambda.cpp -) - +kokkos_add_executable(tutorial_hierarchical_01_thread_teams_lambda SOURCES thread_teams_lambda.cpp) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp index 933b254f7c7ba9e2ce8e916c7d9c55e301c7b174..5ef9908f06f0433eb2597089e93ead80cd8a8642 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -50,26 +50,19 @@ int main(int narg, char* args[]) { // region." 
That is, every team member is active and will execute // the body of the lambda. int sum = 0; -// We also need to protect the usage of a lambda against compiling -// with a backend which doesn't support it (i.e. Cuda 6.5/7.0). -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) parallel_reduce( policy, KOKKOS_LAMBDA(const team_member& thread, int& lsum) { lsum += 1; - // TeamPolicy<>::member_type provides functions to query the - // multidimensional index of a thread, as well as the number of - // thread teams and the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs workaround for printf - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + // TeamPolicy<>::member_type provides functions to query the + // multidimensional index of a thread, as well as the number of + // thread teams and the size of each team. + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); }, sum); -#endif + // The result will be 12*team_policy::team_size_max([=]{}) printf("Result %i\n", sum); diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt index 92b701e4f430f1335c9f478ede4955b1b28626f6..0bc2336ebb99f3e3b345aa6ea178a62fdfcbc058 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_hierarchicalparallelism_02_nested_parallel_for - SOURCES nested_parallel_for.cpp -) +kokkos_add_executable(tutorial_hierarchicalparallelism_02_nested_parallel_for SOURCES nested_parallel_for.cpp) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp index 398810d133195c9534ded415e6ea92b1be70968a..75d6089e9af470badeee47b900302196968cfabe 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp @@ -43,16 +43,11 @@ struct hello_world { // the operator using a team_policy acts like a parallel region for the // team. That means that everything outside of the nested parallel_for is // also executed by all threads of the team. 
- Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31), - [&](const int& i) { -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: (%i , %i) executed loop %i \n", - thread.league_rank(), thread.team_rank(), i); -#else - (void) i; -#endif - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, 31), [&](const int& i) { + Kokkos::printf("Hello World: (%i , %i) executed loop %i \n", + thread.league_rank(), thread.team_rank(), i); + }); } }; diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt index 3907d1666486036b512440ea745beb75165104e8..3bf08dda8b17fcce23414e3f83f495a51ea61af8 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt @@ -1,10 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. 
-KOKKOS_ADD_EXECUTABLE( - tutorial_hierarchicalparallelism_03_vectorization - SOURCES vectorization.cpp -) - +kokkos_add_executable(tutorial_hierarchicalparallelism_03_vectorization SOURCES vectorization.cpp) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt index d2f83a25eab38a0ed6386f97bf6efec915230593..11c746aa21736454cee35da554fe75ec9b3184f0 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt @@ -1,10 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. -KOKKOS_ADD_EXECUTABLE( - tutorial_hierarchicalparallelism_04_team_scan - SOURCES team_scan.cpp -) - +kokkos_add_executable(tutorial_hierarchicalparallelism_04_team_scan SOURCES team_scan.cpp) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt index c892df34cd5067fa54b728624a654b9d65711761..03f3cb63d2a9a21cb74b396212fffcc300fbc496 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt @@ -1,6 +1,4 @@ - -KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for) -KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_vectorization) - +kokkos_add_example_directories(01_thread_teams) +kokkos_add_example_directories(01_thread_teams_lambda) +kokkos_add_example_directories(02_nested_parallel_for) 
+kokkos_add_example_directories(03_vectorization) diff --git a/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt b/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt index 3d9683500dd472c8f8a8dd95c6a641f861f94545..8339e9e3b3f7cee6430dfdcae0e64e0cbd892b72 100644 --- a/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt +++ b/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt @@ -1,9 +1,5 @@ - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # This is a tutorial, not a test, so we don't ask CTest to run it. -KOKKOS_ADD_EXECUTABLE( - tutorial_02_simple_reduce - SOURCES simple_reduce.cpp -) +kokkos_add_executable(launch_bounds_reduce SOURCES launch_bounds_reduce.cpp) diff --git a/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp b/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp index 9471d3c16ac54bba57abad06c7189a51f7adbd02..db651ba77ff69e89c0c65776723a296eb6a7c88e 100644 --- a/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp +++ b/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp @@ -38,8 +38,9 @@ struct collision { int hash(int q) const { // A simple hash by Justin Sobel // Thanks to Arash Partow (partow.net) - char* fourchars = (char*)&q; - int hash = 1315423911; + char* fourchars = + (char*)&q; // NOLINT(cppcoreguidelines-pro-type-cstyle-cast) + int hash = 1315423911; for (int i = 0; i < 4; fourchars++, i++) { hash ^= ((hash << 5) + *fourchars + (hash >> 2)); } diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash index 1b216d9fe35843878bdb6c85c121ddb70efd7e2f..70dd61f9af0865e2c343dc30b5917c54a0a3b120 100755 --- a/packages/kokkos/generate_makefile.bash +++ b/packages/kokkos/generate_makefile.bash @@ -160,21 +160,20 @@ display_help_text() { echo " 
AMD_GFX906 = AMD GPU MI50/MI60 GFX906" echo " AMD_GFX908 = AMD GPU MI100 GFX908" echo " AMD_GFX90A = AMD GPU MI200 GFX90A" + echo " AMD_GFX940 = AMD GPU MI300 GFX940" echo " AMD_GFX942 = AMD GPU MI300 GFX942" echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030" echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100" + echo " AMD_GFX1103 = AMD APU Radeon 740M/760M/780M/880M/890M GFX1103" echo " [ARM]" echo " ARMV80 = ARMv8.0 Compatible CPU" echo " ARMV81 = ARMv8.1 Compatible CPU" echo " ARMV8_THUNDERX = ARMv8 Cavium ThunderX CPU" echo " ARMV8_THUNDERX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -225,7 +224,6 @@ display_help_text() { echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -341,10 +339,6 @@ do KOKKOS_HWLOC=ON HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - KOKKOS_MEMKIND=ON - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -451,15 +445,6 @@ else KOKKOS_HWLOC_CMD= fi -if [ "$KOKKOS_MEMKIND" == "ON" ]; then - KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON - if [ "$MEMKIND_PATH" != "" ]; then - KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH - fi -else - KOKKOS_MEMKIND_CMD= -fi - if [ ! 
-e ${KOKKOS_PATH}/CMakeLists.txt ]; then if [ "${KOKKOS_PATH}" == "" ]; then CM_SCRIPT=$0 @@ -505,5 +490,5 @@ if [[ ${COMPILER} == *clang* ]]; then fi fi -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} 
${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} diff --git a/packages/kokkos/gnu_generate_makefile.bash b/packages/kokkos/gnu_generate_makefile.bash index 5ea159cdd47fc48b0872ae7c3061f176e9a88436..7a197bb71d465c0968c7974c35befaba5a8345b5 100755 --- a/packages/kokkos/gnu_generate_makefile.bash +++ b/packages/kokkos/gnu_generate_makefile.bash @@ -74,9 +74,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -148,12 +145,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -198,7 +192,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." 
- echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -298,11 +291,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt index fd0020b8d53b7bd584a6d63bece6e428d1220cd1..c9e454c1af0f44cc99053c36addf5a9e5738d9eb 100644 --- a/packages/kokkos/master_history.txt +++ b/packages/kokkos/master_history.txt @@ -34,3 +34,10 @@ tag: 4.0.00 date: 02:23:2023 master: 5ad60966 release: 52ea2953 tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88 +tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a +tag: 4.3.00 date: 04:03:2024 master: e0dc0128 release: f08217a4 +tag: 4.3.01 date: 05:07:2024 master: 486cc745 release: 262d2d6e +tag: 4.4.00 date: 08:08:2024 master: 6ecdf605 release: 6068673c +tag: 4.4.01 date: 09:12:2024 master: 08ceff92 release: 2d60c039 +tag: 4.5.00 date: 11:11:2024 master: 15dc143e release: 5164f2f6 +tag: 4.5.01 date: 12:19:2024 master: 09e775bf release: e0d656f9 diff --git a/packages/kokkos/scripts/apply-clang-format b/packages/kokkos/scripts/apply-clang-format index 7f7fb82686e448516625b65fdb2350cff0341aba..411b5dc5b69de0f76eea5cf7d3cecd6184f20729 100755 --- a/packages/kokkos/scripts/apply-clang-format +++ b/packages/kokkos/scripts/apply-clang-format @@ -13,8 +13,8 @@ CLANG_FORMAT_VERSION="$(${CLANG_FORMAT_EXECUTABLE} --version)" CLANG_FORMAT_MAJOR_VERSION=$(echo 
"${CLANG_FORMAT_VERSION}" | sed 's/^[^0-9]*\([0-9]*\).*$/\1/g') CLANG_FORMAT_MINOR_VERSION=$(echo "${CLANG_FORMAT_VERSION}" | sed 's/^[^0-9]*[0-9]*\.\([0-9]*\).*$/\1/g') -if [ "${CLANG_FORMAT_MAJOR_VERSION}" -ne 8 ] || [ "${CLANG_FORMAT_MINOR_VERSION}" -ne 0 ]; then - echo "*** This indent script requires clang-format version 8.0," +if [ "${CLANG_FORMAT_MAJOR_VERSION}" -ne 16 ] || [ "${CLANG_FORMAT_MINOR_VERSION}" -ne 0 ]; then + echo "*** This indent script requires clang-format version 16.0," echo "*** but version ${CLANG_FORMAT_MAJOR_VERSION}.${CLANG_FORMAT_MINOR_VERSION} was found instead." exit 1 fi diff --git a/packages/kokkos/scripts/docker/Dockerfile.clang b/packages/kokkos/scripts/docker/Dockerfile.clang index 5c6abc1c6de53df90d27186b05450fd9eb370dd3..93a45af33a2a18b04a107433110dd9a935acafa5 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.clang +++ b/packages/kokkos/scripts/docker/Dockerfile.clang @@ -1,49 +1,13 @@ -FROM ubuntu:18.04 +FROM ubuntu:24.04@sha256:2e863c44b718727c860746568e1d54afd13b2fa71b160f5cd9058fc436217b30 RUN apt-get update && apt-get install -y \ bc \ git \ build-essential \ + clang-format-16 \ wget \ - ccache \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ - KEYDUMP_FILE=keydump && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ - gpg --import ${KEYDUMP_FILE} && \ - gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \ - rm ${KEYDUMP_FILE}* - -ARG CMAKE_VERSION=3.16.8 -ENV CMAKE_DIR=/opt/cmake -RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ - CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ - CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ - gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ - grep -i 
${CMAKE_SCRIPT} ${CMAKE_SHA256} | sed -e s/linux/Linux/ | sha256sum --check && \ - mkdir -p ${CMAKE_DIR} && \ - sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ - rm cmake* -ENV PATH=${CMAKE_DIR}/bin:$PATH - -ENV LLVM_DIR=/opt/llvm -RUN LLVM_VERSION=8.0.0 && \ - LLVM_URL=https://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz && \ - LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \ - SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ - wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \ - wget --quiet ${LLVM_URL}.sig --output-document=${LLVM_ARCHIVE}.sig && \ - gpg --verify ${LLVM_ARCHIVE}.sig ${LLVM_ARCHIVE} && \ - mkdir -p ${LLVM_DIR} && \ - tar -xvf ${LLVM_ARCHIVE} -C ${LLVM_DIR} --strip-components=1 && \ - echo "${LLVM_DIR}/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig && \ - rm -rf /root/.gnupg && \ - rm -rf ${SCRATCH_DIR} -ENV PATH=${LLVM_DIR}/bin:$PATH +ENV CLANG_FORMAT_EXE=clang-format-16 diff --git a/packages/kokkos/scripts/docker/Dockerfile.gcc b/packages/kokkos/scripts/docker/Dockerfile.gcc index 3cfb39dc20ba48417be4fe8100b1eb8dc283ad4f..3bca9834b5247e5662abdf68474df2813b525751 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.gcc +++ b/packages/kokkos/scripts/docker/Dockerfile.gcc @@ -1,4 +1,4 @@ -FROM ubuntu:20.04 +FROM ubuntu:20.04@sha256:0b897358ff6624825fb50d20ffb605ab0eaea77ced0adb8c6a4b756513dec6fc ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get upgrade -y && apt-get install -y \ @@ -18,7 +18,7 @@ RUN echo "deb http://dk.archive.ubuntu.com/ubuntu/ xenial main" >> /etc/apt/sour apt-get clean && rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ +RUN KEYDUMP_URL=https://cloud1.cees.ornl.gov/download && \ KEYDUMP_FILE=keydump && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ diff --git a/packages/kokkos/scripts/docker/Dockerfile.hipcc 
b/packages/kokkos/scripts/docker/Dockerfile.hipcc index 95f76fe89a355d5f6964dc8847a987b50fc455cf..f8d3851d749bd31f8f217d53c37e76caa9e21bca 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.hipcc +++ b/packages/kokkos/scripts/docker/Dockerfile.hipcc @@ -1,4 +1,4 @@ -ARG BASE=rocm/dev-ubuntu-20.04:5.2 +ARG BASE=rocm/dev-ubuntu-20.04:5.2-complete@sha256:4030c8af0c06c286174758523dabe4b3850bf72d4a8c1ef275d3ec69aa475f65 FROM $BASE RUN apt-get update && apt-get install -y \ @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y \ ENV PATH=/opt/rocm/bin:$PATH -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ +RUN KEYDUMP_URL=https://cloud1.cees.ornl.gov/download && \ KEYDUMP_FILE=keydump && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ diff --git a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject index a46d00f019da02718df10f3cad268cf2560b52d0..2a29e0041c4c7cd16ea2b83000bcbb9d14f41549 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject +++ b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.0.3-devel +FROM nvcr.io/nvidia/cuda:11.0.3-devel-ubuntu18.04@sha256:02d08888085d98c3c41b4db46e0f6b9e22671a70c1a2ff035ea91023effabff5 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub @@ -22,7 +22,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ +RUN KEYDUMP_URL=https://cloud1.cees.ornl.gov/download && \ KEYDUMP_FILE=keydump && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvcc b/packages/kokkos/scripts/docker/Dockerfile.nvcc index cbae8e47dde923705a1065c9330bbc11d0febd16..e87f7dcb055b1a2d2bf8dd3af8c4a4bb9af45a4c 
100644 --- a/packages/kokkos/scripts/docker/Dockerfile.nvcc +++ b/packages/kokkos/scripts/docker/Dockerfile.nvcc @@ -1,4 +1,4 @@ -ARG BASE=nvidia/cuda:9.2-devel +ARG BASE=nvcr.io/nvidia/cuda:11.0.3-devel-ubuntu20.04@sha256:10ab0f09fcdc796b4a2325ef1bce8f766f4a3500eab5a83780f80475ae26c7a6 FROM $BASE ARG ADDITIONAL_PACKAGES @@ -14,7 +14,7 @@ RUN apt-get update && apt-get install -y \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ +RUN KEYDUMP_URL=https://cloud1.cees.ornl.gov/download && \ KEYDUMP_FILE=keydump && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvhpc b/packages/kokkos/scripts/docker/Dockerfile.nvhpc index 88e59de2827af09a36f368c1d8522e7eae958f74..5f611e98d8471bd03812a615dac0e5f15f4ed601 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.nvhpc +++ b/packages/kokkos/scripts/docker/Dockerfile.nvhpc @@ -1,7 +1,7 @@ ARG BASE=nvcr.io/nvidia/nvhpc:23.7-devel-cuda12.2-ubuntu20.04 FROM $BASE -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ +RUN KEYDUMP_URL=https://cloud1.cees.ornl.gov/download && \ KEYDUMP_FILE=keydump && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ diff --git a/packages/kokkos/scripts/docker/Dockerfile.openmptarget b/packages/kokkos/scripts/docker/Dockerfile.openmptarget index 708cf533b8a64862a4f4dc1818f28bc2bba495c4..0d278972aef746c8d9994bc7b22364098da910a8 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.openmptarget +++ b/packages/kokkos/scripts/docker/Dockerfile.openmptarget @@ -1,4 +1,4 @@ -ARG BASE=nvidia/cuda:11.1.1-devel-ubuntu20.04 +ARG BASE=nvcr.io/nvidia/cuda:12.3.2-devel-ubuntu22.04@sha256:b3acdfb50afe62e6c367eba59ecf2d1768f9f174a62e005d282f843779721cb0 FROM $BASE RUN apt-get update && apt-get install -y \ @@ -15,7 +15,7 @@ RUN apt-get update && apt-get install -y \ ARG NPROC=8 -RUN 
KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ +RUN KEYDUMP_URL=https://cloud1.cees.ornl.gov/download && \ KEYDUMP_FILE=keydump && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ @@ -38,7 +38,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO rm ${CMAKE_SCRIPT} ENV PATH=${CMAKE_DIR}/bin:$PATH -ARG LLVM_VERSION=llvmorg-17.0.1 +ARG LLVM_VERSION=llvmorg-17.0.3 ENV LLVM_DIR=/opt/llvm RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\ LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\ @@ -55,8 +55,8 @@ RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\ -DCMAKE_CXX_COMPILER=g++ \ -DLLVM_ENABLE_PROJECTS="clang" \ -DLLVM_ENABLE_RUNTIMES="openmp" \ - -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_70 \ - -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=70 \ + -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ + -DLIBOMPTARGET_DEVICE_ARCHITECTURES=sm_70 \ ../llvm && \ make -j${NPROC} && \ make install && \ diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl index 714461bfe6a53ceb0c83656447bb89267d31213d..1e653e0878c70c96dd8c50aa4d2a4418e1c0233d 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.sycl +++ b/packages/kokkos/scripts/docker/Dockerfile.sycl @@ -1,4 +1,4 @@ -ARG BASE=nvidia/cuda:11.7.1-devel-ubuntu22.04 +ARG BASE=nvcr.io/nvidia/cuda:11.7.1-devel-ubuntu22.04@sha256:a3184e4dc6f968da5bba86df3081ff3013f8e3674a9bfce544af8be905d2f17a FROM $BASE RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub @@ -15,7 +15,7 @@ RUN apt-get update && apt-get install -y \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ +RUN KEYDUMP_URL=https://cloud1.cees.ornl.gov/download && \ KEYDUMP_FILE=keydump && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ @@ -46,12 +46,31 @@ RUN 
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCT apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN wget https://cloud.cees.ornl.gov/download/oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \ +RUN wget https://cloud1.cees.ornl.gov/download/oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \ + echo "3416721faf83e5858e65795231bae47bb51ff91d4e8738613d498674f1636f72 oneapi-for-nvidia-gpus-2023.0.0-linux.sh" | sha256sum --check && \ chmod +x oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \ ./oneapi-for-nvidia-gpus-2023.0.0-linux.sh -y && \ rm oneapi-for-nvidia-gpus-2023.0.0-linux.sh -RUN wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19133/l_oneDPL_p_2022.0.0.25335.sh &&\ - chmod +x ./l_oneDPL_p_2022.0.0.25335.sh && \ - ./l_oneDPL_p_2022.0.0.25335.sh -a -s --eula accept && \ - rm l_oneDPL_p_2022.0.0.25335.sh +ENV ONE_DPL_DIR=/opt/onedpl +RUN . /opt/intel/oneapi/setvars.sh --include-intel-llvm && \ + ONE_DPL_VERSION=oneDPL-2022.2.0 && \ + ONE_DPL_URL=https://github.com/oneapi-src/oneDPL/archive && \ + ONE_DPL_ARCHIVE=${ONE_DPL_VERSION}-rc1.tar.gz && \ + SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ + wget --quiet ${ONE_DPL_URL}/${ONE_DPL_ARCHIVE} && \ + mkdir onedpl && \ + tar -xf ${ONE_DPL_ARCHIVE} -C onedpl --strip-components=1 && cd onedpl && \ + mkdir build && cd build && \ + cmake -DCMAKE_CXX_COMPILER=icpx -DCMAKE_CXX_FLAGS="-w" -DCMAKE_INSTALL_PREFIX=${ONE_DPL_DIR} -DCMAKE_SKIP_INSTALL_ALL_DEPENDENCY=TRUE -DONEDPL_BACKEND="dpcpp_only" .. 
&& \ + make -j${NPROCS} install && \ + rm -rf ${SCRATCH_DIR} + +# clang++ +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin-llvm/:$PATH +# sycl-ls, icpx +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin/:$PATH +# libsycl +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/lib:$LD_LIBRARY_PATH +# libsvml +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH diff --git a/packages/kokkos/scripts/spack_test/CMakeLists.txt b/packages/kokkos/scripts/spack_test/CMakeLists.txt index 4c28bd0b8eccff2487ae1388960bbbc6b8504a34..20274432e4b148ab863439329b63735645b1074f 100644 --- a/packages/kokkos/scripts/spack_test/CMakeLists.txt +++ b/packages/kokkos/scripts/spack_test/CMakeLists.txt @@ -4,16 +4,16 @@ set(TEST_LIST_DEF ${CMAKE_CURRENT_SOURCE_DIR}/test_list.def) file(STRINGS ${TEST_LIST_DEF} TEST_FILES) #Copy test source to Spack test directory -foreach (TEST_FILE ${TEST_FILES}) +foreach(TEST_FILE ${TEST_FILES}) set(TEST_FILE_LOCATION ${SPACK_PACKAGE_SOURCE_DIR}/${TEST_FILE}) file(COPY ${TEST_FILE_LOCATION} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/out) endforeach() #Clean up names -foreach(TEST_FILE ${TEST_FILES} ) - string( REGEX REPLACE ".+\/" "" TEST_FILE ${TEST_FILE} ) +foreach(TEST_FILE ${TEST_FILES}) + string(REGEX REPLACE ".+\/" "" TEST_FILE ${TEST_FILE}) list(APPEND SRC_NAME_LIST ${TEST_FILE}) - string( REPLACE ".cpp" "" TEST_FILE ${TEST_FILE} ) + string(REPLACE ".cpp" "" TEST_FILE ${TEST_FILE}) list(APPEND BIN_NAME_LIST ${TEST_FILE}) endforeach() diff --git a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash index ae1db3186f72df9295535356bfea9a272c861648..830d7b12d904527991952f5a6b78fc518e0c6b8e 100755 --- a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash +++ b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash @@ -59,9 +59,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - 
MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -136,12 +133,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -177,7 +171,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -269,11 +262,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/packages/kokkos/scripts/testing_scripts/gnu_test_all_sandia b/packages/kokkos/scripts/testing_scripts/gnu_test_all_sandia index 57929e897267eb1f12f63de8114f5f9779325254..5394f917a286413304ac83130b6727cd040b23e6 100755 --- a/packages/kokkos/scripts/testing_scripts/gnu_test_all_sandia +++ b/packages/kokkos/scripts/testing_scripts/gnu_test_all_sandia @@ -197,7 +197,7 @@ if [ "$MACHINE" = "sems" ]; then # On unnamed sems machines, assume more restricted rhel7 environment # On rhel7 sems machines gcc/7.3.0, clang/4.0.1, and intel/16.0.3 are missing - # Remove kokkkos-env module use + # Remove kokkos-env module use 
BASE_MODULE_LIST="sems-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>" CUDA9_MODULE_LIST="sems-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0" diff --git a/packages/kokkos/simd/CMakeLists.txt b/packages/kokkos/simd/CMakeLists.txt index 59e09b85ac3df05da80fd102cee7e51fc5fac9fe..916250ae6275c9072eedcf3ba8cdd0e4dd0ac958 100644 --- a/packages/kokkos/simd/CMakeLists.txt +++ b/packages/kokkos/simd/CMakeLists.txt @@ -1,5 +1,5 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() -KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) +kokkos_add_test_directories(unit_tests) diff --git a/packages/kokkos/simd/src/CMakeLists.txt b/packages/kokkos/simd/src/CMakeLists.txt index 8779112bc3c88dcc6752b215c781d356aaa8fe40..ba44f0f8e0010999a8dd8b81032aa6ca553e36c5 100644 --- a/packages/kokkos/simd/src/CMakeLists.txt +++ b/packages/kokkos/simd/src/CMakeLists.txt @@ -1,29 +1,24 @@ #I have to leave these here for tribits -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -FILE(GLOB SIMD_HEADERS *.hpp) -FILE(GLOB SIMD_SOURCES *.cpp) +file(GLOB SIMD_HEADERS *.hpp) +file(GLOB SIMD_SOURCES *.cpp) -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) #----------------------------------------------------------------------------- # We have to pass the sources in here for Tribits # These will get ignored for standalone CMake and a true interface library made -KOKKOS_ADD_LIBRARY( - kokkossimd - SOURCES ${SIMD_SOURCES} - HEADERS ${SIMD_HEADERS} -) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkossimd - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - 
${CMAKE_CURRENT_SOURCE_DIR} +kokkos_add_library(kokkossimd SOURCES ${SIMD_SOURCES} HEADERS ${SIMD_HEADERS}) +kokkos_lib_include_directories( + kokkossimd ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) diff --git a/packages/kokkos/simd/src/Kokkos_SIMD.hpp b/packages/kokkos/simd/src/Kokkos_SIMD.hpp index 57d4afd88beee86e9bc3c20a412fcd206ee03045..a57d7c3026214833d884528cc6eb3dc4054876e9 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD.hpp @@ -154,7 +154,7 @@ struct ForSpace<Kokkos::HIP> { #ifdef KOKKOS_ENABLE_SYCL template <> -struct ForSpace<Kokkos::Experimental::SYCL> { +struct ForSpace<Kokkos::SYCL> { using type = scalar; }; #endif @@ -183,15 +183,18 @@ template <typename... Ts> class data_types {}; #if defined(KOKKOS_ARCH_AVX512XEON) -using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx512_fixed_size<8>>; +using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx512_fixed_size<8>, + simd_abi::avx512_fixed_size<16>>; using data_type_set = data_types<std::int32_t, std::uint32_t, std::int64_t, std::uint64_t, double, float>; #elif defined(KOKKOS_ARCH_AVX2) -using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx2_fixed_size<4>>; +using host_abi_set = abi_set<simd_abi::scalar, simd_abi::avx2_fixed_size<4>, + simd_abi::avx2_fixed_size<8>>; using data_type_set = data_types<std::int32_t, std::int64_t, std::uint64_t, double, float>; #elif defined(KOKKOS_ARCH_ARM_NEON) -using host_abi_set = abi_set<simd_abi::scalar, simd_abi::neon_fixed_size<2>>; +using host_abi_set = abi_set<simd_abi::scalar, simd_abi::neon_fixed_size<2>, + simd_abi::neon_fixed_size<4>>; using data_type_set = data_types<std::int32_t, std::int64_t, std::uint64_t, double, float>; #else diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp index 521160b76fc421e4c957c650d3e1527dc878ecbc..bc4a7dd187ae1d687f360c008c71545e2edba69f 100644 --- 
a/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp @@ -30,9 +30,11 @@ "Kokkos_SIMD_AVX2.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" #endif -// FIXME_HIP ROCm 5.6 and 5.7 can't compile with the intrinsic used here. -#if defined(__HIPCC__) && (HIP_VERSION_MAJOR == 5) && \ - ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7)) +// FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used here. +#if defined(__HIPCC__) && \ + (((HIP_VERSION_MAJOR == 5) && \ + ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7))) || \ + ((HIP_VERSION_MAJOR == 6) && ((HIP_VERSION_MINOR == 0)))) #define KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE #endif @@ -226,6 +228,106 @@ class simd_mask<float, simd_abi::avx2_fixed_size<4>> { } }; +template <> +class simd_mask<float, simd_abi::avx2_fixed_size<8>> { + __m256 m_value; + + public: + class reference { + __m256& m_mask; + int m_lane; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION __m256 bit_mask() const { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. 
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + return _mm256_cvtepi32_ps(_mm256_setr_epi32( +#else + return _mm256_castsi256_ps(_mm256_setr_epi32( +#endif + -std::int32_t(m_lane == 0), -std::int32_t(m_lane == 1), + -std::int32_t(m_lane == 2), -std::int32_t(m_lane == 3), + -std::int32_t(m_lane == 4), -std::int32_t(m_lane == 5), + -std::int32_t(m_lane == 6), -std::int32_t(m_lane == 7))); + } + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(__m256& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + if (value) { + m_mask = _mm256_or_ps(bit_mask(), m_mask); + } else { + m_mask = _mm256_andnot_ps(bit_mask(), m_mask); + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + return (_mm256_movemask_ps(m_mask) & (1 << m_lane)) != 0; + } + }; + using value_type = bool; + using abi_type = simd_abi::avx2_fixed_size<8>; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) + : m_value(_mm256_castsi256_ps(_mm256_set1_epi32(-std::int32_t(value)))) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + G&& gen) noexcept + : m_value(_mm256_castsi256_ps(_mm256_setr_epi32( + -std::int32_t(gen(std::integral_constant<std::size_t, 0>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 1>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 2>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 3>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 4>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 5>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 6>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 7>()))))) {} + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 8; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + __m256 const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return static_cast<value_type>( + reference(const_cast<__m256&>(m_value), int(i))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator||(simd_mask const& other) const { + return simd_mask(_mm256_or_ps(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator&&(simd_mask const& other) const { + return simd_mask(_mm256_and_ps(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator!() const { + auto const true_value = static_cast<__m256>(simd_mask(true)); + return simd_mask(_mm256_andnot_ps(m_value, true_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + simd_mask const& other) const { + return _mm256_movemask_ps(m_value) == _mm256_movemask_ps(other.m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + simd_mask const& other) const { + return !operator==(other); + } +}; + template <> class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> { __m128i m_value; @@ -259,9 +361,7 @@ class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> { }; using value_type = bool; using abi_type = simd_abi::avx2_fixed_size<4>; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION 
explicit simd_mask(value_type value) : m_value(_mm_set1_epi32(-std::int32_t(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { @@ -322,6 +422,107 @@ class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> { } }; +template <> +class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<8>> { + __m256i m_value; + + public: + class reference { + __m256i& m_mask; + int m_lane; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION __m256i bit_mask() const { + return _mm256_setr_epi32( + -std::int32_t(m_lane == 0), -std::int32_t(m_lane == 1), + -std::int32_t(m_lane == 2), -std::int32_t(m_lane == 3), + -std::int32_t(m_lane == 4), -std::int32_t(m_lane == 5), + -std::int32_t(m_lane == 6), -std::int32_t(m_lane == 7)); + } + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(__m256i& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + if (value) { + m_mask = _mm256_or_si256(bit_mask(), m_mask); + } else { + m_mask = _mm256_andnot_si256(bit_mask(), m_mask); + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + return (_mm256_movemask_ps(_mm256_castsi256_ps(m_mask)) & + (1 << m_lane)) != 0; + } + }; + using value_type = bool; + using abi_type = simd_abi::avx2_fixed_size<8>; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) + : m_value(_mm256_set1_epi32(-std::int32_t(value))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 8; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + __m256i const& value_in) + : m_value(value_in) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + G&& gen) noexcept + : 
m_value(_mm256_setr_epi32( + -std::int32_t(gen(std::integral_constant<std::size_t, 0>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 1>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 2>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 3>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 4>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 5>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 6>())), + -std::int32_t(gen(std::integral_constant<std::size_t, 7>())))) {} + template <class U> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask( + simd_mask<U, abi_type> const& other) { + for (std::size_t i = 0; i < size(); ++i) (*this)[i] = other[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return static_cast<value_type>( + reference(const_cast<__m256i&>(m_value), int(i))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator||(simd_mask const& other) const { + return simd_mask(_mm256_or_si256(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator&&(simd_mask const& other) const { + return simd_mask(_mm256_and_si256(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator!() const { + auto const true_value = static_cast<__m256i>(simd_mask(true)); + return simd_mask(_mm256_andnot_si256(m_value, true_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + simd_mask const& other) const { + return _mm256_movemask_ps(_mm256_castsi256_ps(m_value)) == + _mm256_movemask_ps(_mm256_castsi256_ps(other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + simd_mask const& other) const { + return !operator==(other); + } +}; + 
template <> class simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> { __m256i m_value; @@ -356,9 +557,7 @@ class simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> { }; using value_type = bool; using abi_type = simd_abi::avx2_fixed_size<4>; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) : m_value(_mm256_set1_epi64x(-std::int64_t(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { @@ -525,11 +724,11 @@ class simd<double, simd_abi::avx2_fixed_size<4>> { using abi_type = simd_abi::avx2_fixed_size<4>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 4; } @@ -563,16 +762,24 @@ class simd<double, simd_abi::avx2_fixed_size<4>> { element_aligned_tag) { m_value = _mm256_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { 
_mm256_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256d() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd( _mm256_sub_pd(_mm256_set1_pd(0.0), static_cast<__m256d>(m_value))); } @@ -782,19 +989,19 @@ class simd<float, simd_abi::avx2_fixed_size<4>> { using abi_type = simd_abi::avx2_fixed_size<4>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 4; } - template <typename U, std::enable_if_t<std::is_convertible_v<U, value_type>, - bool> = false> + template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, + bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) : m_value(_mm_set1_ps(value_type(value))) {} - template <typename G, + template <class G, std::enable_if_t< std::is_invocable_r_v<value_type, G, std::integral_constant<std::size_t, 0>>, @@ -818,16 +1025,24 @@ class simd<float, simd_abi::avx2_fixed_size<4>> { element_aligned_tag) { m_value = _mm_loadu_ps(ptr); } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd(_mm_sub_ps(_mm_set1_ps(0.0), m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( @@ -1013,43 +1228,44 @@ namespace Experimental { } template <> -class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> { - __m128i m_value; +class simd<float, simd_abi::avx2_fixed_size<8>> { + __m256 m_value; public: - using value_type = std::int32_t; - using abi_type = simd_abi::avx2_fixed_size<4>; + using value_type = float; + using abi_type = simd_abi::avx2_fixed_size<8>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { - return 4; + return 8; } template <class U, 
std::enable_if_t<std::is_convertible_v<U, value_type>, bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm_set1_epi32(value_type(value))) {} + : m_value(_mm256_set1_ps(value_type(value))) {} template <class G, std::enable_if_t< std::is_invocable_r_v<value_type, G, std::integral_constant<std::size_t, 0>>, bool> = false> - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - G&& gen) noexcept - : m_value(_mm_setr_epi32(gen(std::integral_constant<std::size_t, 0>()), + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) + : m_value(_mm256_setr_ps(gen(std::integral_constant<std::size_t, 0>()), gen(std::integral_constant<std::size_t, 1>()), gen(std::integral_constant<std::size_t, 2>()), - gen(std::integral_constant<std::size_t, 3>()))) { + gen(std::integral_constant<std::size_t, 3>()), + gen(std::integral_constant<std::size_t, 4>()), + gen(std::integral_constant<std::size_t, 5>()), + gen(std::integral_constant<std::size_t, 6>()), + gen(std::integral_constant<std::size_t, 7>()))) { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - __m128i const& value_in) + __m256 const& value_in) : m_value(value_in) {} - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( - simd<std::uint64_t, abi_type> const& other); KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast<value_type*>(&m_value)[i]; } @@ -1059,137 +1275,614 @@ class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. 
-#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE - m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); -#else - m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); -#endif + m_value = _mm256_loadu_ps(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_ps(ptr); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + _mm256_storeu_ps(ptr, m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_ps(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type - operator==(simd const& lhs, simd const& rhs) noexcept { - return mask_type( - _mm_cmpeq_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { + return simd(_mm256_sub_ps(_mm256_set1_ps(0.0), m_value)); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type - operator>(simd const& lhs, simd const& rhs) noexcept { - return mask_type( - _mm_cmpgt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_mul_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_div_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + 
return simd(_mm256_add_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_sub_ps(lhs.m_value, rhs.m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { - return mask_type( - _mm_cmplt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_LT_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_GT_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { - return (lhs < rhs) || (lhs == rhs); + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_LE_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { - return (lhs > rhs) || (lhs == rhs); + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_GE_OS)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type - operator!=(simd const& lhs, simd const& rhs) noexcept { - return !(lhs == rhs); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( - simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm_sub_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); - } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( - simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); - } - - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd 
const& lhs, int rhs) noexcept { - return simd(_mm_srai_epi32(static_cast<__m128i>(lhs), rhs)); - } - - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( - simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm_srav_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); - } - - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, int rhs) noexcept { - return simd(_mm_slli_epi32(static_cast<__m128i>(lhs), rhs)); + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_EQ_OS)); } - - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( - simd const& lhs, simd const& rhs) noexcept { - return simd( - _mm_sllv_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmp_ps(static_cast<__m256>(lhs), + static_cast<__m256>(rhs), _CMP_NEQ_OS)); } }; } // namespace Experimental -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> - abs(Experimental::simd< - std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { - __m128i const rhs = static_cast<__m128i>(a); +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> +copysign( + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + a, + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + b) { + __m256 const sign_mask = _mm256_set1_ps(-0.0); + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_xor_ps(_mm256_andnot_ps(sign_mask, static_cast<__m256>(a)), + _mm256_and_ps(sign_mask, static_cast<__m256>(b)))); +} + +[[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + abs(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + __m256 const sign_mask = _mm256_set1_ps(-0.0); + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_andnot_ps(sign_mask, static_cast<__m256>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + floor(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_round_ps(static_cast<__m256>(a), + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + ceil(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_round_ps(static_cast<__m256>(a), + (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + round(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_round_ps(static_cast<__m256>(a), + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + trunc(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_round_ps(static_cast<__m256>(a), + (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC))); +} + +[[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + sqrt(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_sqrt_ps(static_cast<__m256>(a))); +} + +#ifdef __INTEL_COMPILER + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + cbrt(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_cbrt_ps(static_cast<__m256>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + exp(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_exp_ps(static_cast<__m256>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> + log(Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_log_ps(static_cast<__m256>(a))); +} + +#endif + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> +fma(Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + a, + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + b, + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + c) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_fmadd_ps(static_cast<__m256>(a), static_cast<__m256>(b), + static_cast<__m256>(c))); +} + +[[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> +max(Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + a, + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + b) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_max_ps(static_cast<__m256>(a), static_cast<__m256>(b))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx2_fixed_size<8>> +min(Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + a, + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> const& + b) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_min_ps(static_cast<__m256>(a), static_cast<__m256>(b))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd<float, simd_abi::avx2_fixed_size<8>> + condition(simd_mask<float, simd_abi::avx2_fixed_size<8>> const& a, + simd<float, simd_abi::avx2_fixed_size<8>> const& b, + simd<float, simd_abi::avx2_fixed_size<8>> const& c) { + return simd<float, simd_abi::avx2_fixed_size<8>>(_mm256_blendv_ps( + static_cast<__m256>(c), static_cast<__m256>(b), static_cast<__m256>(a))); +} + +template <> +class simd<std::int32_t, simd_abi::avx2_fixed_size<4>> { + __m128i m_value; + + public: + using value_type = std::int32_t; + using abi_type = simd_abi::avx2_fixed_size<4>; + using mask_type = simd_mask<value_type, abi_type>; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION 
static constexpr std::size_t size() { + return 4; + } + template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm_set1_epi32(value_type(value))) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value(_mm_setr_epi32(gen(std::integral_constant<std::size_t, 0>()), + gen(std::integral_constant<std::size_t, 1>()), + gen(std::integral_constant<std::size_t, 2>()), + gen(std::integral_constant<std::size_t, 3>()))) { + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m128i const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd<std::uint64_t, abi_type> const& other); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast<value_type*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast<value_type const*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. 
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_load_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() + const { + return m_value; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + _mm_cmpeq_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + _mm_cmpgt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + _mm_cmplt_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return (lhs < rhs) || (lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return (lhs > rhs) || (lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return !(lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd( + 
_mm_sub_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_mullo_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm_srai_epi32(static_cast<__m128i>(lhs), rhs)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_srav_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm_slli_epi32(static_cast<__m128i>(lhs), rhs)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_sllv_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> + abs(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + __m128i const rhs = static_cast<__m128i>(a); + return Experimental::simd<std::int32_t, + Experimental::simd_abi::avx2_fixed_size<4>>( + _mm_abs_epi32(rhs)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> + floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return 
Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>( + _mm256_cvtepi32_pd(static_cast<__m128i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> + ceil(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>( + _mm256_cvtepi32_pd(static_cast<__m128i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> + round(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>( + _mm256_cvtepi32_pd(static_cast<__m128i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> + trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { + return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>( + _mm256_cvtepi32_pd(static_cast<__m128i>(a))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd<std::int32_t, simd_abi::avx2_fixed_size<4>> + condition(simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> const& a, + simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& b, + simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& c) { + return simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(_mm_castps_si128( + _mm_blendv_ps(_mm_castsi128_ps(static_cast<__m128i>(c)), + _mm_castsi128_ps(static_cast<__m128i>(b)), + _mm_castsi128_ps(static_cast<__m128i>(a))))); +} + +template <> +class simd<std::int32_t, simd_abi::avx2_fixed_size<8>> { + __m256i m_value; + + public: + using value_type = std::int32_t; + using abi_type = simd_abi::avx2_fixed_size<8>; + using mask_type = simd_mask<value_type, 
abi_type>; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 8; + } + template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm256_set1_epi32(value_type(value))) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value( + _mm256_setr_epi32(gen(std::integral_constant<std::size_t, 0>()), + gen(std::integral_constant<std::size_t, 1>()), + gen(std::integral_constant<std::size_t, 2>()), + gen(std::integral_constant<std::size_t, 3>()), + gen(std::integral_constant<std::size_t, 4>()), + gen(std::integral_constant<std::size_t, 5>()), + gen(std::integral_constant<std::size_t, 6>()), + gen(std::integral_constant<std::size_t, 7>()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m256i const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast<value_type*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast<value_type const*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. 
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi32(ptr, static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi32(ptr, static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm256_maskstore_epi32(ptr, static_cast<__m256i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi32(ptr, static_cast<__m256i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() + const { + return m_value; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmpeq_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm256_cmpgt_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return !(lhs >= rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return (lhs < rhs) || (lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& 
rhs) noexcept { + return (lhs > rhs) || (lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return !(lhs == rhs); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm256_sub_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm256_add_epi32(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm256_srai_epi32(static_cast<__m256i>(lhs), rhs)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_srav_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm256_slli_epi32(static_cast<__m256i>(lhs), rhs)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm256_sllv_epi32(static_cast<__m256i>(lhs), + static_cast<__m256i>(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> + abs(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + __m256i const rhs = 
static_cast<__m256i>(a); return Experimental::simd<std::int32_t, - Experimental::simd_abi::avx2_fixed_size<4>>( - _mm_abs_epi32(rhs)); + Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_abs_epi32(rhs)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> floor(Experimental::simd< - std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { - return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>( - _mm256_cvtepi32_pd(static_cast<__m128i>(a))); + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_cvtepi32_ps(static_cast<__m256i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> ceil(Experimental::simd< - std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { - return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>( - _mm256_cvtepi32_pd(static_cast<__m128i>(a))); + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_cvtepi32_ps(static_cast<__m256i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> round(Experimental::simd< - std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { - return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>( - _mm256_cvtepi32_pd(static_cast<__m128i>(a))); + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, 
Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_cvtepi32_ps(static_cast<__m256i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>> + Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>> trunc(Experimental::simd< - std::int32_t, Experimental::simd_abi::avx2_fixed_size<4>> const& a) { - return Experimental::simd<double, Experimental::simd_abi::avx2_fixed_size<4>>( - _mm256_cvtepi32_pd(static_cast<__m128i>(a))); + std::int32_t, Experimental::simd_abi::avx2_fixed_size<8>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::avx2_fixed_size<8>>( + _mm256_cvtepi32_ps(static_cast<__m256i>(a))); } namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd<std::int32_t, simd_abi::avx2_fixed_size<4>> - condition(simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> const& a, - simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& b, - simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& c) { - return simd<std::int32_t, simd_abi::avx2_fixed_size<4>>(_mm_castps_si128( - _mm_blendv_ps(_mm_castsi128_ps(static_cast<__m128i>(c)), - _mm_castsi128_ps(static_cast<__m128i>(b)), - _mm_castsi128_ps(static_cast<__m128i>(a))))); + simd<std::int32_t, simd_abi::avx2_fixed_size<8>> + condition(simd_mask<std::int32_t, simd_abi::avx2_fixed_size<8>> const& a, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>> const& b, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>> const& c) { + return simd<std::int32_t, simd_abi::avx2_fixed_size<8>>(_mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(static_cast<__m256i>(c)), + _mm256_castsi256_ps(static_cast<__m256i>(b)), + _mm256_castsi256_ps(static_cast<__m256i>(a))))); } template <> @@ -1203,11 +1896,11 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> { using abi_type = simd_abi::avx2_fixed_size<4>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 4; } @@ -1249,6 +1942,15 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> { #else m_value = _mm256_maskload_epi64(reinterpret_cast<long long const*>(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast<long long const*>(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( @@ -1256,13 +1958,18 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> { _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), static_cast<__m256i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { 
return simd( _mm256_sub_epi64(_mm256_set1_epi64x(0), static_cast<__m256i>(m_value))); } @@ -1278,6 +1985,13 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> { _mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit signed integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + // AVX2 only has eq and gt comparisons for int64 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { @@ -1306,17 +2020,19 @@ class simd<std::int64_t, simd_abi::avx2_fixed_size<4>> { return !(lhs == rhs); } + // fallback simd shift right arithmetic using generator constructor // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, int rhs) noexcept { - // return simd(_mm256_srai_epi64(static_cast<__m256i>(lhs), rhs)); - // } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs; }); + } - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, simd const& rhs) noexcept { - // return simd(_mm256_srav_epi64(static_cast<__m256i>(lhs), - // static_cast<__m256i>(rhs)))); - // } + // fallback simd shift right arithmetic using generator constructor + // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd 
operator<<( simd const& lhs, int rhs) noexcept { @@ -1397,11 +2113,11 @@ class simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> { using abi_type = simd_abi::avx2_fixed_size<4>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 4; } @@ -1446,6 +2162,25 @@ class simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> { static_cast<__m256i>(mask_type(true))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast<long long const*>(ptr), + static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; @@ -1460,6 +2195,14 @@ 
class simd<std::uint64_t, simd_abi::avx2_fixed_size<4>> { return simd( _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit unsigned integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { return _mm256_srli_epi64(static_cast<__m256i>(lhs), rhs); @@ -1588,6 +2331,11 @@ class const_where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>, static_cast<__m256d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)), + static_cast<__m256d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) const { @@ -1624,6 +2372,11 @@ class where_expression<simd_mask<double, simd_abi::avx2_fixed_size<4>>, mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_maskload_pd( + mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) { @@ -1667,6 +2420,11 @@ class const_where_expression<simd_mask<float, simd_abi::avx2_fixed_size<4>>, static_cast<__m128>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm_maskstore_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)), + static_cast<__m128>(m_value)); + } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) const { @@ -1703,6 +2461,11 @@ class where_expression<simd_mask<float, simd_abi::avx2_fixed_size<4>>, _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type( + _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, simd<std::int32_t, simd_abi::avx2_fixed_size<4>> const& index) { @@ -1724,6 +2487,94 @@ class where_expression<simd_mask<float, simd_abi::avx2_fixed_size<4>>, } }; +template <> +class const_where_expression<simd_mask<float, simd_abi::avx2_fixed_size<8>>, + simd<float, simd_abi::avx2_fixed_size<8>>> { + public: + using abi_type = simd_abi::avx2_fixed_size<8>; + using value_type = simd<float, abi_type>; + using mask_type = simd_mask<float, abi_type>; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, element_aligned_tag) const { + _mm256_maskstore_ps(mem, _mm256_castps_si256(static_cast<__m256>(m_mask)), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm256_maskstore_ps(mem, _mm256_castps_si256(static_cast<__m256>(m_mask)), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + float* mem, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>> const& index) const { + for (std::size_t lane = 0; lane < value_type::size(); ++lane) { + if (m_mask[lane]) mem[index[lane]] = m_value[lane]; + } + } + + [[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression<simd_mask<float, simd_abi::avx2_fixed_size<8>>, + simd<float, simd_abi::avx2_fixed_size<8>>> + : public const_where_expression< + simd_mask<float, simd_abi::avx2_fixed_size<8>>, + simd<float, simd_abi::avx2_fixed_size<8>>> { + public: + where_expression( + simd_mask<float, simd_abi::avx2_fixed_size<8>> const& mask_arg, + simd<float, simd_abi::avx2_fixed_size<8>>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, element_aligned_tag) { + m_value = value_type(_mm256_maskload_ps( + mem, _mm256_castps_si256(static_cast<__m256>(m_mask)))); + } + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_maskload_ps( + mem, _mm256_castps_si256(static_cast<__m256>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + float const* mem, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>> const& index) { + m_value = value_type(_mm256_mask_i32gather_ps( + static_cast<__m256>(m_value), mem, static_cast<__m256i>(index), + static_cast<__m256>(m_mask), 4)); + } + template <class U, + std::enable_if_t<std::is_convertible_v< + U, simd<float, simd_abi::avx2_fixed_size<8>>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast<simd<float, simd_abi::avx2_fixed_size<8>>>( + std::forward<U>(x)); + m_value = simd<float, simd_abi::avx2_fixed_size<8>>(_mm256_blendv_ps( + static_cast<__m256>(m_value), static_cast<__m256>(x_as_value_type), + static_cast<__m256>(m_mask))); + } +}; + template <> class const_where_expression< simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>>, @@ -1746,6 +2597,12 @@ class 
const_where_expression< _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), static_cast<__m128i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), + static_cast<__m128i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1786,6 +2643,16 @@ class where_expression<simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>>, m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m128i tmp = _mm_load_si128(reinterpret_cast<__m128i const*>(mem)); + m_value = value_type(_mm_and_si128(tmp, static_cast<__m128i>(m_mask))); +#else + m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1810,6 +2677,109 @@ class where_expression<simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>>, } }; +template <> +class const_where_expression< + simd_mask<std::int32_t, simd_abi::avx2_fixed_size<8>>, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>>> { + public: + using abi_type = simd_abi::avx2_fixed_size<8>; + using value_type = simd<std::int32_t, abi_type>; + using mask_type = simd_mask<std::int32_t, abi_type>; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, element_aligned_tag) const { + _mm256_maskstore_epi32(mem, static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, 
vector_aligned_tag) const { + _mm256_maskstore_epi32(mem, static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + std::int32_t* mem, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>> const& index) const { + for (std::size_t lane = 0; lane < value_type::size(); ++lane) { + if (m_mask[lane]) mem[index[lane]] = m_value[lane]; + } + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression<simd_mask<std::int32_t, simd_abi::avx2_fixed_size<8>>, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>>> + : public const_where_expression< + simd_mask<std::int32_t, simd_abi::avx2_fixed_size<8>>, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>>> { + public: + where_expression( + simd_mask<std::int32_t, simd_abi::avx2_fixed_size<8>> const& mask_arg, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, element_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = + value_type(_mm256_maskload_epi32(mem, static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = + value_type(_mm256_maskload_epi32(mem, static_cast<__m256i>(m_mask))); +#endif + } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + std::int32_t const* mem, + simd<std::int32_t, simd_abi::avx2_fixed_size<8>> const& index) { + m_value = value_type(_mm256_mask_i32gather_epi32( + static_cast<__m256i>(m_value), mem, static_cast<__m256i>(index), + static_cast<__m256i>(m_mask), 4)); + } + template < + class U, + std::enable_if_t<std::is_convertible_v< + U, simd<std::int32_t, simd_abi::avx2_fixed_size<8>>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast<simd<std::int32_t, simd_abi::avx2_fixed_size<8>>>( + std::forward<U>(x)); + m_value = simd<std::int32_t, simd_abi::avx2_fixed_size<8>>( + _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(static_cast<__m256i>(m_value)), + _mm256_castsi256_ps(static_cast<__m256i>(x_as_value_type)), + _mm256_castsi256_ps(static_cast<__m256i>(m_mask))))); + } +}; + template <> class const_where_expression< simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>>, @@ -1833,6 +2803,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::int64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1874,6 +2851,17 @@ class where_expression<simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>>, reinterpret_cast<long long const*>(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::int64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + 
reinterpret_cast<long long const*>(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1922,6 +2910,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::uint64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1963,6 +2958,17 @@ class where_expression<simd_mask<std::uint64_t, simd_abi::avx2_fixed_size<4>>, reinterpret_cast<long long const*>(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::uint64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast<long long const*>(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp index c5d1717ad4ea765bf7648c68b1430cac7d3fae95..ce9ef8d0ff360701db2fc1e4bf8c40c6f34e9ec2 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp @@ -140,6 +140,122 @@ class simd_mask<T, simd_abi::avx512_fixed_size<8>> { } }; +template <class T> +class simd_mask<T, simd_abi::avx512_fixed_size<16>> { + __mmask16 m_value; + + public: + class reference { + __mmask16& m_mask; + int m_lane; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION __mmask16 bit_mask() const { + return __mmask16(std::int32_t(1 << m_lane)); + } + + public: + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(__mmask16& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + if (value) { + m_mask |= bit_mask(); + } else { + m_mask &= ~bit_mask(); + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + return (m_mask & bit_mask()) != 0; + } + }; + using value_type = bool; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) + : m_value(-std::int32_t(value)) {} + template <class U> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask( + simd_mask<U, simd_abi::avx512_fixed_size<16>> const& other) + : m_value(static_cast<__mmask16>(other)) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(G&& gen) : m_value(false) { + reference(m_value, int(0)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 0>())); + reference(m_value, int(1)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 1>())); + reference(m_value, int(2)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 2>())); + reference(m_value, int(3)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 3>())); + reference(m_value, int(4)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 4>())); + reference(m_value, int(5)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 5>())); + reference(m_value, int(6)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 6>())); + reference(m_value, int(7)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 7>())); + reference(m_value, int(8)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 8>())); + reference(m_value, int(9)) = + 
static_cast<bool>(gen(std::integral_constant<std::size_t, 9>())); + reference(m_value, int(10)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 10>())); + reference(m_value, int(11)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 11>())); + reference(m_value, int(12)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 12>())); + reference(m_value, int(13)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 13>())); + reference(m_value, int(14)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 14>())); + reference(m_value, int(15)) = + static_cast<bool>(gen(std::integral_constant<std::size_t, 15>())); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 16; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + __mmask16 const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __mmask16() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + auto const bit_mask = __mmask16(std::int32_t(1 << i)); + return (m_value & bit_mask) != 0; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator||(simd_mask const& other) const { + return simd_mask(_kor_mask16(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask + operator&&(simd_mask const& other) const { + return simd_mask(_kand_mask16(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask operator!() const { + static const __mmask16 true_value(static_cast<__mmask16>(simd_mask(true))); + return simd_mask(_kxor_mask16(true_value, m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + simd_mask const& other) const { + return m_value == other.m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION 
bool operator!=( + simd_mask const& other) const { + return m_value != other.m_value; + } +}; + template <> class simd<double, simd_abi::avx512_fixed_size<8>> { __m512d m_value; @@ -149,11 +265,11 @@ class simd<double, simd_abi::avx512_fixed_size<8>> { using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 8; } @@ -193,16 +309,24 @@ class simd<double, simd_abi::avx512_fixed_size<8>> { element_aligned_tag) { m_value = _mm512_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd(_mm512_sub_pd(_mm512_set1_pd(0.0), m_value)); } @@ -434,11 +558,11 @@ class 
simd<float, simd_abi::avx512_fixed_size<8>> { using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 8; } @@ -475,16 +599,24 @@ class simd<float, simd_abi::avx512_fixed_size<8>> { element_aligned_tag) { m_value = _mm256_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd(_mm256_sub_ps(_mm256_set1_ps(0.0), m_value)); } @@ -684,6 +816,280 @@ simd<float, simd_abi::avx512_fixed_size<8>> condition( static_cast<__m256>(b))); } +template <> +class simd<float, simd_abi::avx512_fixed_size<16>> { + __m512 m_value; + + public: + using value_type = float; + using 
abi_type = simd_abi::avx512_fixed_size<16>; + using mask_type = simd_mask<value_type, abi_type>; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 16; + } + template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm512_set1_ps(value_type(value))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m512 const& value_in) + : m_value(value_in) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) + : m_value( + _mm512_setr_ps(gen(std::integral_constant<std::size_t, 0>()), + gen(std::integral_constant<std::size_t, 1>()), + gen(std::integral_constant<std::size_t, 2>()), + gen(std::integral_constant<std::size_t, 3>()), + gen(std::integral_constant<std::size_t, 4>()), + gen(std::integral_constant<std::size_t, 5>()), + gen(std::integral_constant<std::size_t, 6>()), + gen(std::integral_constant<std::size_t, 7>()), + gen(std::integral_constant<std::size_t, 8>()), + gen(std::integral_constant<std::size_t, 9>()), + gen(std::integral_constant<std::size_t, 10>()), + gen(std::integral_constant<std::size_t, 11>()), + gen(std::integral_constant<std::size_t, 12>()), + gen(std::integral_constant<std::size_t, 13>()), + gen(std::integral_constant<std::size_t, 14>()), + gen(std::integral_constant<std::size_t, 15>()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return 
reinterpret_cast<value_type*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast<value_type const*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm512_loadu_ps(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_ps(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm512_storeu_ps(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_ps(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512() + const { + return m_value; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { + return simd(_mm512_sub_ps(_mm512_set1_ps(0.0), m_value)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_mul_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_div_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_add_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_sub_ps(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LT_OS)); + } + [[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GT_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_LE_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_GE_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_EQ_OS)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmp_ps_mask(lhs.m_value, rhs.m_value, _CMP_NEQ_OS)); + } +}; + +} // namespace Experimental + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> +copysign(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a, + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& b) { + __m512 const sign_mask = _mm512_set1_ps(-0.0); + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_xor_ps(_mm512_andnot_ps(sign_mask, static_cast<__m512>(a)), + _mm512_and_ps(sign_mask, static_cast<__m512>(b)))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> abs( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const sign_mask = _mm512_set1_ps(-0.0); + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + 
_mm512_andnot_ps(sign_mask, static_cast<__m512>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> + floor(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const val = static_cast<__m512>(a); + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_roundscale_ps(val, _MM_FROUND_TO_NEG_INF)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> + ceil(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const val = static_cast<__m512>(a); + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_roundscale_ps(val, _MM_FROUND_TO_POS_INF)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> + round(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const val = static_cast<__m512>(a); + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_roundscale_ps(val, _MM_FROUND_TO_NEAREST_INT)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> + trunc(Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512 const val = static_cast<__m512>(a); + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_roundscale_ps(val, _MM_FROUND_TO_ZERO)); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> sqrt( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + 
Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_sqrt_ps(static_cast<__m512>(a))); +} + +#ifdef __INTEL_COMPILER + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> cbrt( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cbrt_ps(static_cast<__m512>(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> exp( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_exp_ps(static_cast<__m512>(a))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> log( + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_log_ps(static_cast<__m512>(a))); +} + +#endif + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> fma( + Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>> const& a, + Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>> const& b, + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& c) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_fmadd_ps(static_cast<__m512>(a), static_cast<__m512>(b), + static_cast<__m512>(c))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> max( + Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>> const& a, + Experimental::simd< + float, 
Experimental::simd_abi::avx512_fixed_size<16>> const& b) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_max_ps(static_cast<__m512>(a), static_cast<__m512>(b))); +} + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +Experimental::simd<float, Experimental::simd_abi::avx512_fixed_size<16>> min( + Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>> const& a, + Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> const& b) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_min_ps(static_cast<__m512>(a), static_cast<__m512>(b))); +} + +namespace Experimental { + +KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION +simd<float, simd_abi::avx512_fixed_size<16>> condition( + simd_mask<float, simd_abi::avx512_fixed_size<16>> const& a, + simd<float, simd_abi::avx512_fixed_size<16>> const& b, + simd<float, simd_abi::avx512_fixed_size<16>> const& c) { + return simd<float, simd_abi::avx512_fixed_size<16>>( + _mm512_mask_blend_ps(static_cast<__mmask16>(a), static_cast<__m512>(c), + static_cast<__m512>(b))); +} + template <> class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> { __m256i m_value; @@ -693,11 +1099,11 @@ class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> { using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 8; } @@ -735,23 +1141,33 @@ class simd<std::int32_t, simd_abi::avx512_fixed_size<8>> { operator[](std::size_t i) const { return reinterpret_cast<value_type const*>(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd(_mm256_sub_epi32(_mm256_set1_epi32(0), m_value)); } @@ -882,29 +1298,245 @@ namespace Experimental { } template <> -class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> { - __m256i m_value; +class simd<std::int32_t, simd_abi::avx512_fixed_size<16>> { + __m512i m_value; public: - using value_type = std::uint32_t; - using abi_type = 
simd_abi::avx512_fixed_size<8>; + using value_type = std::int32_t; + using abi_type = simd_abi::avx512_fixed_size<16>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { - return 8; + return 16; } template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, bool> = false> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) - : m_value(_mm256_set1_epi32( - Kokkos::bit_cast<std::int32_t>(value_type(value)))) {} + : m_value(_mm512_set1_epi32(value_type(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - __m256i const& value_in) + __m512i const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd<std::uint64_t, abi_type> const& other); + template <class G, + std::enable_if_t< + // basically, can you do { value_type r = + // gen(std::integral_constant<std::size_t, i>()); } + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value(_mm512_setr_epi32( + gen(std::integral_constant<std::size_t, 0>()), + gen(std::integral_constant<std::size_t, 1>()), + gen(std::integral_constant<std::size_t, 2>()), + gen(std::integral_constant<std::size_t, 3>()), + 
gen(std::integral_constant<std::size_t, 4>()), + gen(std::integral_constant<std::size_t, 5>()), + gen(std::integral_constant<std::size_t, 6>()), + gen(std::integral_constant<std::size_t, 7>()), + gen(std::integral_constant<std::size_t, 8>()), + gen(std::integral_constant<std::size_t, 9>()), + gen(std::integral_constant<std::size_t, 10>()), + gen(std::integral_constant<std::size_t, 11>()), + gen(std::integral_constant<std::size_t, 12>()), + gen(std::integral_constant<std::size_t, 13>()), + gen(std::integral_constant<std::size_t, 14>()), + gen(std::integral_constant<std::size_t, 15>()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast<value_type*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast<value_type const*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm512_mask_storeu_epi32(ptr, static_cast<__mmask16>(mask_type(true)), + m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_mask_store_epi32(ptr, static_cast<__mmask16>(mask_type(true)), + m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm512_mask_loadu_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_mask_load_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() + const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { + return simd(_mm512_sub_epi32(_mm512_set1_epi32(0), m_value)); + } + + [[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_mullo_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd<std::int32_t, simd_abi::avx512_fixed_size<16>>( + _mm512_add_epi32(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd<std::int32_t, simd_abi::avx512_fixed_size<16>>( + _mm512_sub_epi32(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmplt_epi32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmplt_epi32_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmple_epi32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmple_epi32_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmpeq_epi32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& 
lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmpneq_epi32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm512_srai_epi32(static_cast<__m512i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_srav_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm512_slli_epi32(static_cast<__m512i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_sllv_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> +abs(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + __m512i const rhs = static_cast<__m512i>(a); + return Experimental::simd<std::int32_t, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_abs_epi32(rhs)); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cvtepi32_ps(static_cast<__m512i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +ceil(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + 
return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cvtepi32_ps(static_cast<__m512i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +round(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cvtepi32_ps(static_cast<__m512i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cvtepi32_ps(static_cast<__m512i>(a))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> + condition(simd_mask<std::int32_t, simd_abi::avx512_fixed_size<16>> const& a, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& b, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& c) { + return simd<std::int32_t, simd_abi::avx512_fixed_size<16>>( + _mm512_mask_blend_epi32(static_cast<__mmask16>(a), + static_cast<__m512i>(c), + static_cast<__m512i>(b))); +} + +template <> +class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> { + __m256i m_value; + + public: + using value_type = std::uint32_t; + using abi_type = simd_abi::avx512_fixed_size<8>; + using mask_type = simd_mask<value_type, abi_type>; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& 
operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 8; + } + template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm256_set1_epi32( + Kokkos::bit_cast<std::int32_t>(value_type(value)))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m256i const& value_in) : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& other) @@ -939,11 +1571,21 @@ class simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { m_value = _mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; @@ -1035,42 +1677,253 @@ floor(Experimental::simd< } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> + double, Experimental::simd_abi::avx512_fixed_size<8>> +ceil(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd<double, + Experimental::simd_abi::avx512_fixed_size<8>>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION 
Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +round(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd<double, + Experimental::simd_abi::avx512_fixed_size<8>>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + double, Experimental::simd_abi::avx512_fixed_size<8>> +trunc(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { + return Experimental::simd<double, + Experimental::simd_abi::avx512_fixed_size<8>>( + _mm512_cvtepu32_pd(static_cast<__m256i>(a))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> + condition(simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& a, + simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& b, + simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& c) { + return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>( + _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c), + static_cast<__m256i>(b))); +} + +template <> +class simd<std::uint32_t, simd_abi::avx512_fixed_size<16>> { + __m512i m_value; + + public: + using value_type = std::uint32_t; + using abi_type = simd_abi::avx512_fixed_size<16>; + using mask_type = simd_mask<value_type, abi_type>; + using reference = value_type&; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 16; + } + template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, 
+ bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(_mm512_set1_epi32( + Kokkos::bit_cast<std::int32_t>(value_type(value)))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + __m512i const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& other) + : m_value(static_cast<__m512i>(other)) {} + template <class G, + std::enable_if_t< + // basically, can you do { value_type r = + // gen(std::integral_constant<std::size_t, i>()); } + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept + : m_value(_mm512_setr_epi32( + gen(std::integral_constant<std::size_t, 0>()), + gen(std::integral_constant<std::size_t, 1>()), + gen(std::integral_constant<std::size_t, 2>()), + gen(std::integral_constant<std::size_t, 3>()), + gen(std::integral_constant<std::size_t, 4>()), + gen(std::integral_constant<std::size_t, 5>()), + gen(std::integral_constant<std::size_t, 6>()), + gen(std::integral_constant<std::size_t, 7>()), + gen(std::integral_constant<std::size_t, 8>()), + gen(std::integral_constant<std::size_t, 9>()), + gen(std::integral_constant<std::size_t, 10>()), + gen(std::integral_constant<std::size_t, 11>()), + gen(std::integral_constant<std::size_t, 12>()), + gen(std::integral_constant<std::size_t, 13>()), + gen(std::integral_constant<std::size_t, 14>()), + gen(std::integral_constant<std::size_t, 15>()))) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reinterpret_cast<value_type*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reinterpret_cast<value_type const*>(&m_value)[i]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = 
_mm512_mask_loadu_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_mask_load_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + _mm512_mask_storeu_epi32(ptr, static_cast<__mmask16>(mask_type(true)), + m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_mask_store_epi32(ptr, static_cast<__mmask16>(mask_type(true)), + m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() + const { + return m_value; + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_mullo_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm512_add_epi32(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm512_sub_epi32(static_cast<__m512i>(lhs), static_cast<__m512i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmplt_epu32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmplt_epu32_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + 
operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmple_epu32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmple_epu32_mask(static_cast<__m512i>(rhs), + static_cast<__m512i>(lhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmpeq_epu32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(_mm512_cmpneq_epu32_mask(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(_mm512_srli_epi32(static_cast<__m512i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_srlv_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd(_mm512_slli_epi32(static_cast<__m512i>(lhs), rhs)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd(_mm512_sllv_epi32(static_cast<__m512i>(lhs), + static_cast<__m512i>(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> +abs(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return a; +} + +[[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> +floor(Experimental::simd< + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cvtepu32_ps(static_cast<__m512i>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::avx512_fixed_size<16>> ceil(Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd<double, - Experimental::simd_abi::avx512_fixed_size<8>>( - _mm512_cvtepu32_pd(static_cast<__m256i>(a))); + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cvtepu32_ps(static_cast<__m512i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> + float, Experimental::simd_abi::avx512_fixed_size<16>> round(Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd<double, - Experimental::simd_abi::avx512_fixed_size<8>>( - _mm512_cvtepu32_pd(static_cast<__m256i>(a))); + std::uint32_t, Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cvtepu32_ps(static_cast<__m512i>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< - double, Experimental::simd_abi::avx512_fixed_size<8>> + float, Experimental::simd_abi::avx512_fixed_size<16>> trunc(Experimental::simd< - std::uint32_t, Experimental::simd_abi::avx512_fixed_size<8>> const& a) { - return Experimental::simd<double, - Experimental::simd_abi::avx512_fixed_size<8>>( - _mm512_cvtepu32_pd(static_cast<__m256i>(a))); + std::uint32_t, 
Experimental::simd_abi::avx512_fixed_size<16>> const& a) { + return Experimental::simd<float, + Experimental::simd_abi::avx512_fixed_size<16>>( + _mm512_cvtepu32_ps(static_cast<__m512i>(a))); } namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> - condition(simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& a, - simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& b, - simd<std::uint32_t, simd_abi::avx512_fixed_size<8>> const& c) { - return simd<std::uint32_t, simd_abi::avx512_fixed_size<8>>( - _mm256_mask_blend_epi32(static_cast<__mmask8>(a), static_cast<__m256i>(c), - static_cast<__m256i>(b))); + simd<std::uint32_t, simd_abi::avx512_fixed_size<16>> + condition( + simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<16>> const& a, + simd<std::uint32_t, simd_abi::avx512_fixed_size<16>> const& b, + simd<std::uint32_t, simd_abi::avx512_fixed_size<16>> const& c) { + return simd<std::uint32_t, simd_abi::avx512_fixed_size<16>>( + _mm512_mask_blend_epi32(static_cast<__mmask16>(a), + static_cast<__m512i>(c), + static_cast<__m512i>(b))); } template <> @@ -1082,11 +1935,11 @@ class simd<std::int64_t, simd_abi::avx512_fixed_size<8>> { using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 8; } @@ -1130,17 +1983,26 @@ class simd<std::int64_t, simd_abi::avx512_fixed_size<8>> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd(_mm512_sub_epi64(_mm512_set1_epi64(0), m_value)); } @@ -1281,11 +2143,11 @@ class simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> { using abi_type = simd_abi::avx512_fixed_size<8>; using mask_type = simd_mask<value_type, abi_type>; using reference = value_type&; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 8; } @@ -1331,10 +2193,19 @@ class simd<std::uint64_t, simd_abi::avx512_fixed_size<8>> { 
element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1505,6 +2376,11 @@ class const_where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>, static_cast<__m512d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm512_mask_store_pd(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) const { @@ -1541,6 +2417,11 @@ class where_expression<simd_mask<double, simd_abi::avx512_fixed_size<8>>, _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_pd( + _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) { @@ -1584,6 +2465,11 @@ class const_where_expression<simd_mask<float, simd_abi::avx512_fixed_size<8>>, static_cast<__m256>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm256_mask_store_ps(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd<std::int32_t, 
simd_abi::avx512_fixed_size<8>> const& index) const { @@ -1619,6 +2505,10 @@ class where_expression<simd_mask<float, simd_abi::avx512_fixed_size<8>>, m_value = value_type(_mm256_mask_loadu_ps( _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_ps( + _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1627,7 +2517,7 @@ class where_expression<simd_mask<float, simd_abi::avx512_fixed_size<8>>, __m256 mask = _mm256_maskz_mov_ps(static_cast<__mmask8>(m_mask), on); m_value = value_type( _mm256_mask_i32gather_ps(static_cast<__m256>(m_value), mem, - static_cast<__m256i>(index), mask, 4)); + static_cast<__m256i>(index), mask, 4)); } template < class U, @@ -1644,6 +2534,95 @@ class where_expression<simd_mask<float, simd_abi::avx512_fixed_size<8>>, } }; +template <> +class const_where_expression<simd_mask<float, simd_abi::avx512_fixed_size<16>>, + simd<float, simd_abi::avx512_fixed_size<16>>> { + public: + using abi_type = simd_abi::avx512_fixed_size<16>; + using value_type = simd<float, abi_type>; + using mask_type = simd_mask<float, abi_type>; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, element_aligned_tag) const { + _mm512_mask_storeu_ps(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm512_mask_store_ps(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + float* mem, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& 
index) const { + _mm512_mask_i32scatter_ps(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), + static_cast<__m512>(m_value), 4); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression<simd_mask<float, simd_abi::avx512_fixed_size<16>>, + simd<float, simd_abi::avx512_fixed_size<16>>> + : public const_where_expression< + simd_mask<float, simd_abi::avx512_fixed_size<16>>, + simd<float, simd_abi::avx512_fixed_size<16>>> { + public: + where_expression( + simd_mask<float, simd_abi::avx512_fixed_size<16>> const& mask_arg, + simd<float, simd_abi::avx512_fixed_size<16>>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, element_aligned_tag) { + m_value = value_type(_mm512_mask_loadu_ps( + _mm512_set1_ps(0.0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_ps( + _mm512_set1_ps(0.0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + float const* mem, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& index) { + m_value = value_type(_mm512_mask_i32gather_ps( + static_cast<__m512>(m_value), static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), mem, 4)); + } + template <class U, std::enable_if_t< + std::is_convertible_v< + U, simd<float, simd_abi::avx512_fixed_size<16>>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast<simd<float, simd_abi::avx512_fixed_size<16>>>( + std::forward<U>(x)); + m_value = simd<float, simd_abi::avx512_fixed_size<16>>(_mm512_mask_blend_ps( + 
static_cast<__mmask16>(m_mask), static_cast<__m512>(m_value), + static_cast<__m512>(x_as_value_type))); + } +}; + template <> class const_where_expression< simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>, @@ -1666,6 +2645,12 @@ class const_where_expression< _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1702,6 +2687,11 @@ class where_expression<simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1710,6 +2700,7 @@ class where_expression<simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template <class U, std::enable_if_t< std::is_convertible_v< @@ -1726,6 +2717,98 @@ class where_expression<simd_mask<std::int32_t, simd_abi::avx512_fixed_size<8>>, } }; +template <> +class const_where_expression< + simd_mask<std::int32_t, simd_abi::avx512_fixed_size<16>>, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>>> { + public: + using abi_type = simd_abi::avx512_fixed_size<16>; + using value_type = simd<std::int32_t, abi_type>; + using mask_type = simd_mask<std::int32_t, abi_type>; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : 
m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, element_aligned_tag) const { + _mm512_mask_storeu_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + std::int32_t* mem, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& index) const { + _mm512_mask_i32scatter_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), + static_cast<__m512i>(m_value), 4); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression<simd_mask<std::int32_t, simd_abi::avx512_fixed_size<16>>, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>>> + : public const_where_expression< + simd_mask<std::int32_t, simd_abi::avx512_fixed_size<16>>, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>>> { + public: + where_expression( + simd_mask<std::int32_t, simd_abi::avx512_fixed_size<16>> const& mask_arg, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, element_aligned_tag) { + m_value = value_type(_mm512_mask_loadu_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(m_mask), mem)); + } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + std::int32_t const* mem, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& index) { + m_value = value_type(_mm512_mask_i32gather_epi32( + static_cast<__m512i>(m_value), static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), mem, 4)); + } + template <class U, + std::enable_if_t< + std::is_convertible_v< + U, simd<std::int32_t, simd_abi::avx512_fixed_size<16>>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast<simd<std::int32_t, simd_abi::avx512_fixed_size<16>>>( + std::forward<U>(x)); + m_value = simd<std::int32_t, simd_abi::avx512_fixed_size<16>>( + _mm512_mask_blend_epi32(static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value), + static_cast<__m512i>(x_as_value_type))); + } +}; + template <> class const_where_expression< simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>>, @@ -1748,6 +2831,12 @@ class const_where_expression< _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint32_t* mem, @@ -1784,6 +2873,12 @@ class where_expression<simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint32_t const* mem, @@ -1792,6 +2887,7 @@ class where_expression<simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>>, 
static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template <class U, std::enable_if_t< std::is_convertible_v< @@ -1808,6 +2904,99 @@ class where_expression<simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<8>>, } }; +template <> +class const_where_expression< + simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<16>>, + simd<std::uint32_t, simd_abi::avx512_fixed_size<16>>> { + public: + using abi_type = simd_abi::avx512_fixed_size<16>; + using value_type = simd<std::uint32_t, abi_type>; + using mask_type = simd_mask<std::uint32_t, abi_type>; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, element_aligned_tag) const { + _mm512_mask_storeu_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + std::uint32_t* mem, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& index) const { + _mm512_mask_i32scatter_epi32(mem, static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), + static_cast<__m512i>(m_value), 4); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression< + simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<16>>, + simd<std::uint32_t, simd_abi::avx512_fixed_size<16>>> + : public const_where_expression< + 
simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<16>>, + simd<std::uint32_t, simd_abi::avx512_fixed_size<16>>> { + public: + where_expression( + simd_mask<std::uint32_t, simd_abi::avx512_fixed_size<16>> const& mask_arg, + simd<std::uint32_t, simd_abi::avx512_fixed_size<16>>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, element_aligned_tag) { + m_value = value_type(_mm512_mask_loadu_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi32( + _mm512_set1_epi32(0), static_cast<__mmask16>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + std::uint32_t const* mem, + simd<std::int32_t, simd_abi::avx512_fixed_size<16>> const& index) { + m_value = value_type(_mm512_mask_i32gather_epi32( + static_cast<__m512i>(m_value), static_cast<__mmask16>(m_mask), + static_cast<__m512i>(index), mem, 4)); + } + template <class U, + std::enable_if_t< + std::is_convertible_v< + U, simd<std::uint32_t, simd_abi::avx512_fixed_size<16>>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast<simd<std::uint32_t, simd_abi::avx512_fixed_size<16>>>( + std::forward<U>(x)); + m_value = simd<std::uint32_t, simd_abi::avx512_fixed_size<16>>( + _mm512_mask_blend_epi32(static_cast<__mmask16>(m_mask), + static_cast<__m512i>(m_value), + static_cast<__m512i>(x_as_value_type))); + } +}; + template <> class const_where_expression< simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>>, @@ -1830,6 +3019,12 @@ class const_where_expression< _mm512_mask_storeu_epi64(mem, static_cast<__mmask8>(m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + 
_mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1866,6 +3061,12 @@ class where_expression<simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>>, m_value = value_type(_mm512_mask_loadu_epi64( _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1874,6 +3075,7 @@ class where_expression<simd_mask<std::int64_t, simd_abi::avx512_fixed_size<8>>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template <class U, std::enable_if_t< std::is_convertible_v< @@ -1912,6 +3114,12 @@ class const_where_expression< _mm512_mask_storeu_epi64(mem, static_cast<__mmask8>(m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1949,6 +3157,11 @@ class where_expression<simd_mask<std::uint64_t, simd_abi::avx512_fixed_size<8>>, _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, simd<std::int32_t, simd_abi::avx512_fixed_size<8>> const& index) { @@ -1956,6 +3169,7 @@ class where_expression<simd_mask<std::uint64_t, 
simd_abi::avx512_fixed_size<8>>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template <class U, std::enable_if_t< std::is_convertible_v< diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp index 87edf994533df8c79cae87bf3e38e560d0fec84e..c39749b8b8447722cc86a3d0d70c75d4fb96c555 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_Common.hpp @@ -31,7 +31,16 @@ class simd; template <class T, class Abi> class simd_mask; -struct element_aligned_tag {}; +class simd_alignment_vector_aligned {}; + +template <typename... Flags> +struct simd_flags {}; + +inline constexpr simd_flags<> simd_flag_default{}; +inline constexpr simd_flags<simd_alignment_vector_aligned> simd_flag_aligned{}; + +using element_aligned_tag = simd_flags<>; +using vector_aligned_tag = simd_flags<simd_alignment_vector_aligned>; // class template declarations for const_where_expression and where_expression @@ -117,48 +126,6 @@ template <class T> return const_where_expression(mask, value); } -// fallback simd multiplication using generator constructor -// At the time of this writing, this fallback is only used -// to multiply vectors of 64-bit signed integers for the AVX2 backend - -template <class T, class Abi> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator*( - simd<T, Abi> const& lhs, simd<T, Abi> const& rhs) { - return simd<T, Abi>([&](std::size_t i) { return lhs[i] * rhs[i]; }); -} - -// fallback simd shift using generator constructor -// At the time of this edit, only the fallback for shift vectors of -// 64-bit signed integers for the AVX2 backend is used - -template <typename T, typename Abi, - typename = std::enable_if_t<std::is_integral_v<T>>> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator>>( - simd<T, Abi> const& lhs, int rhs) { - return simd<T, Abi>([&](std::size_t i) { 
return lhs[i] >> rhs; }); -} - -template <typename T, typename Abi, - typename = std::enable_if_t<std::is_integral_v<T>>> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator<<( - simd<T, Abi> const& lhs, int rhs) { - return simd<T, Abi>([&](std::size_t i) { return lhs[i] << rhs; }); -} - -template <typename T, typename Abi, - typename = std::enable_if_t<std::is_integral_v<T>>> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator>>( - simd<T, Abi> const& lhs, simd<T, Abi> const& rhs) { - return simd<T, Abi>([&](std::size_t i) { return lhs[i] >> rhs[i]; }); -} - -template <typename T, typename Abi, - typename = std::enable_if_t<std::is_integral_v<T>>> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd<T, Abi> operator<<( - simd<T, Abi> const& lhs, simd<T, Abi> const& rhs) { - return simd<T, Abi>([&](std::size_t i) { return lhs[i] << rhs[i]; }); -} - // The code below provides: // operator@(simd<T, Abi>, Arithmetic) // operator@(Arithmetic, simd<T, Abi>) diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp index 43ece2038903e1139485f14973f8816c3e2f24b1..6219cd8547fee8b2533dd3beecd623ad264736cf 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp @@ -42,11 +42,11 @@ class neon_fixed_size {}; namespace Impl { -template <class Derived, int Bits> +template <class Derived, int Bits, int Size> class neon_mask; template <class Derived> -class neon_mask<Derived, 64> { +class neon_mask<Derived, 64, 2> { uint64x2_t m_value; public: @@ -104,12 +104,13 @@ class neon_mask<Derived, 64> { } template <class U> KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( - neon_mask<U, 32> const& other) { + neon_mask<U, 32, 2> const& other) { operator[](0) = bool(other[0]); operator[](1) = bool(other[1]); } template <class U> - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask<U, 64> const& other) + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( + neon_mask<U, 64, 2> const& other) : neon_mask(static_cast<uint64x2_t>(other)) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; @@ -158,7 +159,7 @@ class neon_mask<Derived, 64> { }; template <class Derived> -class neon_mask<Derived, 32> { +class neon_mask<Derived, 32, 2> { uint32x2_t m_value; public: @@ -211,10 +212,12 @@ class neon_mask<Derived, 32> { m_value, 1); } template <class U> - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask<U, 64> const& other) + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( + neon_mask<U, 64, 2> const& other) : m_value(vqmovn_u64(static_cast<uint64x2_t>(other))) {} template <class U> - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask(neon_mask<U, 32> const& other) + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask( + neon_mask<U, 32, 2> const& other) : m_value(static_cast<uint32x2_t>(other)) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; @@ -260,14 +263,125 @@ class neon_mask<Derived, 32> { } }; +template <class Derived> +class neon_mask<Derived, 32, 4> { + uint32x4_t m_value; + + public: + class reference { + uint32x4_t& m_mask; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(uint32x4_t& mask_arg, + int lane_arg) + : m_mask(mask_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(bool value) const { + switch (m_lane) { + case 0: + m_mask = vsetq_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 0); + break; + case 1: + m_mask = vsetq_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 1); + break; + case 2: + m_mask = vsetq_lane_u32(value ? 0xFFFFFFFFU : 0, m_mask, 2); + break; + case 3: + m_mask = vsetq_lane_u32(value ? 
0xFFFFFFFFU : 0, m_mask, 3); + break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator bool() const { + switch (m_lane) { + case 0: return vgetq_lane_u32(m_mask, 0) != 0; + case 1: return vgetq_lane_u32(m_mask, 1) != 0; + case 2: return vgetq_lane_u32(m_mask, 2) != 0; + case 3: return vgetq_lane_u32(m_mask, 3) != 0; + } + return false; + } + }; + using value_type = bool; + using abi_type = simd_abi::neon_fixed_size<4>; + using implementation_type = uint32x4_t; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION neon_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit neon_mask(value_type value) + : m_value(vmovq_n_u32(value ? 0xFFFFFFFFU : 0)) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit neon_mask( + G&& gen) noexcept { + m_value = vsetq_lane_u32( + (gen(std::integral_constant<std::size_t, 0>()) ? 0xFFFFFFFFU : 0), + m_value, 0); + m_value = vsetq_lane_u32( + (gen(std::integral_constant<std::size_t, 1>()) ? 0xFFFFFFFFU : 0), + m_value, 1); + m_value = vsetq_lane_u32( + (gen(std::integral_constant<std::size_t, 2>()) ? 0xFFFFFFFFU : 0), + m_value, 2); + m_value = vsetq_lane_u32( + (gen(std::integral_constant<std::size_t, 3>()) ? 
0xFFFFFFFFU : 0), + m_value, 3); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 4; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit neon_mask( + uint32x4_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint32x4_t() + const { + return m_value; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return static_cast<value_type>( + reference(const_cast<uint32x4_t&>(m_value), int(i))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived + operator||(neon_mask const& other) const { + return Derived(vorrq_u32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived + operator&&(neon_mask const& other) const { + return Derived(vandq_u32(m_value, other.m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Derived operator!() const { + auto const true_value = static_cast<uint32x4_t>(neon_mask(true)); + return Derived(veorq_u32(m_value, true_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator==( + neon_mask const& other) const { + uint32x4_t const elementwise_equality = vceqq_u32(m_value, other.m_value); + uint64x2_t const overall_equality_neon = + vreinterpretq_u64_u32(elementwise_equality); + return (overall_equality_neon[0] == 0xFFFFFFFFFFFFFFFFULL) && + (overall_equality_neon[1] == 0xFFFFFFFFFFFFFFFFULL); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION bool operator!=( + neon_mask const& other) const { + return !operator==(other); + } +}; + } // namespace Impl template <class T> class simd_mask<T, simd_abi::neon_fixed_size<2>> : public Impl::neon_mask<simd_mask<T, simd_abi::neon_fixed_size<2>>, - sizeof(T) * 8> { + sizeof(T) * 8, 2> { using base_type = Impl::neon_mask<simd_mask<T, simd_abi::neon_fixed_size<2>>, - sizeof(T) * 8>; + sizeof(T) * 8, 2>; public: using 
implementation_type = typename base_type::implementation_type; @@ -291,6 +405,35 @@ class simd_mask<T, simd_abi::neon_fixed_size<2>> : base_type(gen) {} }; +template <class T> +class simd_mask<T, simd_abi::neon_fixed_size<4>> + : public Impl::neon_mask<simd_mask<T, simd_abi::neon_fixed_size<4>>, + sizeof(T) * 8, 4> { + using base_type = Impl::neon_mask<simd_mask<T, simd_abi::neon_fixed_size<4>>, + sizeof(T) * 8, 4>; + + public: + using implementation_type = typename base_type::implementation_type; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(bool value) + : base_type(value) {} + template <class U> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask( + simd_mask<U, simd_abi::neon_fixed_size<4>> const& other) + : base_type(other) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + implementation_type const& value) + : base_type(value) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<typename base_type::value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd_mask( + G&& gen) noexcept + : base_type(gen) {} +}; + template <> class simd<double, simd_abi::neon_fixed_size<2>> { float64x2_t m_value; @@ -323,11 +466,11 @@ class simd<double, simd_abi::neon_fixed_size<2>> { return 0; } }; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = 
default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; } @@ -363,16 +506,24 @@ class simd<double, simd_abi::neon_fixed_size<2>> { element_aligned_tag) { m_value = vld1q_f64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_f64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_f64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_f64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float64x2_t() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd(vnegq_f64(m_value)); } @@ -570,11 +721,11 @@ class simd<float, simd_abi::neon_fixed_size<2>> { return 0; } }; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; } @@ -607,16 +758,24 @@ class simd<float, simd_abi::neon_fixed_size<2>> { element_aligned_tag) { m_value = vld1_f32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_f32(ptr); + } 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_f32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_f32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float32x2_t() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd(vneg_f32(m_value)); } @@ -772,6 +931,256 @@ namespace Experimental { static_cast<float32x2_t>(c))); } +template <> +class simd<float, simd_abi::neon_fixed_size<4>> { + float32x4_t m_value; + + public: + using value_type = float; + using abi_type = simd_abi::neon_fixed_size<4>; + using mask_type = simd_mask<value_type, abi_type>; + class reference { + float32x4_t& m_value; + int m_lane; + + public: + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(float32x4_t& value_arg, + int lane_arg) + : m_value(value_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(float value) const { + switch (m_lane) { + case 0: m_value = vsetq_lane_f32(value, m_value, 0); break; + case 1: m_value = vsetq_lane_f32(value, m_value, 1); break; + case 2: m_value = vsetq_lane_f32(value, m_value, 2); break; + case 3: m_value = vsetq_lane_f32(value, m_value, 3); break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator float() const { + switch (m_lane) { + case 0: return vgetq_lane_f32(m_value, 0); + case 1: return vgetq_lane_f32(m_value, 1); + case 2: return vgetq_lane_f32(m_value, 2); + case 3: return vgetq_lane_f32(m_value, 3); + } + return 0; + } + }; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& 
operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 4; + } + template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(vmovq_n_f32(value_type(value))) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(G&& gen) { + m_value = vsetq_lane_f32(gen(std::integral_constant<std::size_t, 0>()), + m_value, 0); + m_value = vsetq_lane_f32(gen(std::integral_constant<std::size_t, 1>()), + m_value, 1); + m_value = vsetq_lane_f32(gen(std::integral_constant<std::size_t, 2>()), + m_value, 2); + m_value = vsetq_lane_f32(gen(std::integral_constant<std::size_t, 3>()), + m_value, 3); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + float32x4_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reference(const_cast<simd*>(this)->m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = vld1q_f32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_f32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_f32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_f32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit + operator float32x4_t() const { + return m_value; + } + 
[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { + return simd(vnegq_f32(m_value)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd(vmulq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator/( + simd const& lhs, simd const& rhs) noexcept { + return simd(vdivq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd(vaddq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd(vsubq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vcltq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vcgtq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vcleq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vcgeq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type(vceqq_f32(lhs.m_value, rhs.m_value)); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return !(lhs == rhs); + } +}; + +} // namespace Experimental + 
+[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> + abs(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vabsq_f32(static_cast<float32x4_t>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> + floor(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vrndmq_f32(static_cast<float32x4_t>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> + ceil(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vrndpq_f32(static_cast<float32x4_t>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> + round(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vrndxq_f32(static_cast<float32x4_t>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> + trunc(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vrndq_f32(static_cast<float32x4_t>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> +copysign( + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> const& + a, + Experimental::simd<float, 
Experimental::simd_abi::neon_fixed_size<4>> const& + b) { + uint32x4_t const sign_mask = vreinterpretq_u32_f32(vmovq_n_f32(-0.0)); + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vreinterpretq_f32_u32(vorrq_u32( + vreinterpretq_u32_f32(static_cast<float32x4_t>(abs(a))), + vandq_u32(sign_mask, + vreinterpretq_u32_f32(static_cast<float32x4_t>(b)))))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> + sqrt(Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> const& a) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vsqrtq_f32(static_cast<float32x4_t>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> +fma(Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> const& + a, + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> const& + b, + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> const& + c) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vfmaq_f32(static_cast<float32x4_t>(c), static_cast<float32x4_t>(b), + static_cast<float32x4_t>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> +max(Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> const& + a, + Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> const& + b) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vmaxq_f32(static_cast<float32x4_t>(a), static_cast<float32x4_t>(b))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION Experimental::simd< + float, Experimental::simd_abi::neon_fixed_size<4>> +min(Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> const& + a, + 
Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>> const& + b) { + return Experimental::simd<float, Experimental::simd_abi::neon_fixed_size<4>>( + vminq_f32(static_cast<float32x4_t>(a), static_cast<float32x4_t>(b))); +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd<float, simd_abi::neon_fixed_size<4>> + condition(simd_mask<float, simd_abi::neon_fixed_size<4>> const& a, + simd<float, simd_abi::neon_fixed_size<4>> const& b, + simd<float, simd_abi::neon_fixed_size<4>> const& c) { + return simd<float, simd_abi::neon_fixed_size<4>>( + vbslq_f32(static_cast<uint32x4_t>(a), static_cast<float32x4_t>(b), + static_cast<float32x4_t>(c))); +} + template <> class simd<std::int32_t, simd_abi::neon_fixed_size<2>> { int32x2_t m_value; @@ -804,11 +1213,11 @@ class simd<std::int32_t, simd_abi::neon_fixed_size<2>> { return 0; } }; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; } @@ -829,7 +1238,227 @@ class simd<std::int32_t, simd_abi::neon_fixed_size<2>> { m_value, 1); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( - int32x2_t const& value_in) + int32x2_t const& value_in) + : m_value(value_in) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( + simd<std::uint64_t, abi_type> const& other); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference 
operator[](std::size_t i) { + return reference(m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type + operator[](std::size_t i) const { + return reference(const_cast<simd*>(this)->m_value, int(i)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = vld1_s32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_s32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1_s32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_s32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x2_t() + const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { + return simd(vneg_s32(m_value)); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vsub_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vadd_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmul_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator==(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vceq_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>(simd const& lhs, simd const& rhs) noexcept { + return 
mask_type( + vcgt_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vclt_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator<=(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vcle_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator>=(simd const& lhs, simd const& rhs) noexcept { + return mask_type( + vcge_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type + operator!=(simd const& lhs, simd const& rhs) noexcept { + return !(lhs == rhs); + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd(vshl_s32(static_cast<int32x2_t>(lhs), + vneg_s32(vmov_n_s32(std::int32_t(rhs))))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd(vshl_s32(static_cast<int32x2_t>(lhs), + vneg_s32(static_cast<int32x2_t>(rhs)))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, int rhs) noexcept { + return simd( + vshl_s32(static_cast<int32x2_t>(lhs), vmov_n_s32(std::int32_t(rhs)))); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vshl_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + } +}; + +} // namespace Experimental + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + abs(Experimental::simd< + std::int32_t, 
Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return Experimental::simd<std::int32_t, + Experimental::simd_abi::neon_fixed_size<2>>( + vabs_s32(static_cast<int32x2_t>(a))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + floor(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + ceil(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + round(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + trunc(Experimental::simd< + std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + return a; +} + +namespace Experimental { + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + simd<std::int32_t, simd_abi::neon_fixed_size<2>> + condition(simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>> const& a, + simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& b, + simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& c) { + return simd<std::int32_t, simd_abi::neon_fixed_size<2>>( + vbsl_s32(static_cast<uint32x2_t>(a), static_cast<int32x2_t>(b), + static_cast<int32x2_t>(c))); +} + +template <> +class simd<std::int32_t, simd_abi::neon_fixed_size<4>> { + int32x4_t m_value; + + public: + using value_type = std::int32_t; + using abi_type = simd_abi::neon_fixed_size<4>; + using mask_type = simd_mask<value_type, abi_type>; + class reference { + int32x4_t& m_value; + int m_lane; + + public: + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference(int32x4_t& value_arg, + int lane_arg) + : m_value(value_arg), m_lane(lane_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference + operator=(std::int32_t value) const { + switch (m_lane) { + case 0: m_value = vsetq_lane_s32(value, m_value, 0); break; + case 1: m_value = vsetq_lane_s32(value, m_value, 1); break; + case 2: m_value = vsetq_lane_s32(value, m_value, 2); break; + case 3: m_value = vsetq_lane_s32(value, m_value, 3); break; + } + return *this; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION operator std::int32_t() const { + switch (m_lane) { + case 0: return vgetq_lane_s32(m_value, 0); + case 1: return vgetq_lane_s32(m_value, 1); + case 2: return vgetq_lane_s32(m_value, 2); + case 3: return vgetq_lane_s32(m_value, 3); + } + return 0; + } + }; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { + return 4; + } + template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(U&& value) + : m_value(vmovq_n_s32(value_type(value))) {} + template <class G, + std::enable_if_t< + std::is_invocable_r_v<value_type, G, + std::integral_constant<std::size_t, 0>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + G&& gen) noexcept { + m_value = vsetq_lane_s32(gen(std::integral_constant<std::size_t, 0>()), + m_value, 0); + m_value = vsetq_lane_s32(gen(std::integral_constant<std::size_t, 1>()), + m_value, 1); + m_value = vsetq_lane_s32(gen(std::integral_constant<std::size_t, 2>()), + m_value, 2); + m_value = 
vsetq_lane_s32(gen(std::integral_constant<std::size_t, 3>()), + m_value, 3); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + int32x4_t const& value_in) : m_value(value_in) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd( simd<std::uint64_t, abi_type> const& other); @@ -842,57 +1471,69 @@ class simd<std::int32_t, simd_abi::neon_fixed_size<2>> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - m_value = vld1_s32(ptr); + m_value = vld1q_s32(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_s32(ptr); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { - vst1_s32(ptr, m_value); + vst1q_s32(ptr, m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x2_t() + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_s32(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x4_t() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { - return simd(vneg_s32(m_value)); + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { + return simd(vnegq_s32(m_value)); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator-( simd const& lhs, simd const& rhs) noexcept { return simd( - vsub_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + vsubq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator+( simd const& lhs, simd const& rhs) noexcept { return simd( - vadd_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + vaddq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); + } + [[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmulq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( - vceq_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + vceqq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>(simd const& lhs, simd const& rhs) noexcept { return mask_type( - vcgt_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + vcgtq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<(simd const& lhs, simd const& rhs) noexcept { return mask_type( - vclt_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + vcltq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator<=(simd const& lhs, simd const& rhs) noexcept { return mask_type( - vcle_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + vcleq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator>=(simd const& lhs, simd const& rhs) noexcept { return mask_type( - vcge_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + vcgeq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator!=(simd const& lhs, simd const& rhs) noexcept { @@ -901,75 +1542,75 @@ class simd<std::int32_t, simd_abi::neon_fixed_size<2>> { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { - return 
simd(vshl_s32(static_cast<int32x2_t>(lhs), - vneg_s32(vmov_n_s32(std::int32_t(rhs))))); + return simd(vshlq_s32(static_cast<int32x4_t>(lhs), + vnegq_s32(vmovq_n_s32(std::int32_t(rhs))))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, simd const& rhs) noexcept { - return simd(vshl_s32(static_cast<int32x2_t>(lhs), - vneg_s32(static_cast<int32x2_t>(rhs)))); + return simd(vshlq_s32(static_cast<int32x4_t>(lhs), + vnegq_s32(static_cast<int32x4_t>(rhs)))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, int rhs) noexcept { return simd( - vshl_s32(static_cast<int32x2_t>(lhs), vmov_n_s32(std::int32_t(rhs)))); + vshlq_s32(static_cast<int32x4_t>(lhs), vmovq_n_s32(std::int32_t(rhs)))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, simd const& rhs) noexcept { return simd( - vshl_s32(static_cast<int32x2_t>(lhs), static_cast<int32x2_t>(rhs))); + vshlq_s32(static_cast<int32x4_t>(lhs), static_cast<int32x4_t>(rhs))); } }; } // namespace Experimental [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> abs(Experimental::simd< - std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { return Experimental::simd<std::int32_t, - Experimental::simd_abi::neon_fixed_size<2>>( - vabs_s32(static_cast<int32x2_t>(a))); + Experimental::simd_abi::neon_fixed_size<4>>( + vabsq_s32(static_cast<int32x4_t>(a))); } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> floor(Experimental::simd< - std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + 
std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { return a; } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> ceil(Experimental::simd< - std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { return a; } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> round(Experimental::simd< - std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { return a; } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> + Experimental::simd<std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> trunc(Experimental::simd< - std::int32_t, Experimental::simd_abi::neon_fixed_size<2>> const& a) { + std::int32_t, Experimental::simd_abi::neon_fixed_size<4>> const& a) { return a; } namespace Experimental { [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION - simd<std::int32_t, simd_abi::neon_fixed_size<2>> - condition(simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>> const& a, - simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& b, - simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& c) { - return simd<std::int32_t, simd_abi::neon_fixed_size<2>>( - vbsl_s32(static_cast<uint32x2_t>(a), static_cast<int32x2_t>(b), - static_cast<int32x2_t>(c))); + simd<std::int32_t, simd_abi::neon_fixed_size<4>> + condition(simd_mask<std::int32_t, simd_abi::neon_fixed_size<4>> const& a, + simd<std::int32_t, simd_abi::neon_fixed_size<4>> const& b, + simd<std::int32_t, simd_abi::neon_fixed_size<4>> const& c) { + 
return simd<std::int32_t, simd_abi::neon_fixed_size<4>>( + vbslq_s32(static_cast<uint32x4_t>(a), static_cast<int32x4_t>(b), + static_cast<int32x4_t>(c))); } template <> @@ -1004,11 +1645,11 @@ class simd<std::int64_t, simd_abi::neon_fixed_size<2>> { return 0; } }; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; } @@ -1044,17 +1685,25 @@ class simd<std::int64_t, simd_abi::neon_fixed_size<2>> { element_aligned_tag) { m_value = vld1q_s64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_s64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_s64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_s64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int64x2_t() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator-() const - noexcept { + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd + operator-() const noexcept { return simd(vnegq_s64(m_value)); } @@ -1068,7 +1717,10 @@ class simd<std::int64_t, simd_abi::neon_fixed_size<2>> { return simd( vaddq_s64(static_cast<int64x2_t>(lhs), static_cast<int64x2_t>(rhs))); } - + [[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1204,11 +1856,11 @@ class simd<std::uint64_t, simd_abi::neon_fixed_size<2>> { return 0; } }; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd() = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd const&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd& operator=(simd&&) = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 2; } @@ -1246,6 +1898,18 @@ class simd<std::uint64_t, simd_abi::neon_fixed_size<2>> { element_aligned_tag) { m_value = vld1q_u64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_u64(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_u64(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_u64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t() const { return m_value; @@ -1261,7 +1925,10 @@ class simd<std::uint64_t, simd_abi::neon_fixed_size<2>> { return simd( vaddq_u64(static_cast<uint64x2_t>(lhs), static_cast<uint64x2_t>(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd 
const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( simd const& lhs, simd const& rhs) noexcept { return simd( @@ -1386,6 +2053,11 @@ class const_where_expression<simd_mask<double, simd_abi::neon_fixed_size<2>>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) const { @@ -1421,6 +2093,11 @@ class where_expression<simd_mask<double, simd_abi::neon_fixed_size<2>>, if (m_mask[1]) m_value[1] = mem[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) { @@ -1464,6 +2141,11 @@ class const_where_expression<simd_mask<float, simd_abi::neon_fixed_size<2>>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd<std::int32_t, simd_abi::neon_fixed_size<2>> const& index) const { @@ -1498,6 +2180,10 @@ class where_expression<simd_mask<float, simd_abi::neon_fixed_size<2>>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + void copy_from(float const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1520,6 +2206,106 @@ 
class where_expression<simd_mask<float, simd_abi::neon_fixed_size<2>>, } }; +template <> +class const_where_expression<simd_mask<float, simd_abi::neon_fixed_size<4>>, + simd<float, simd_abi::neon_fixed_size<4>>> { + public: + using abi_type = simd_abi::neon_fixed_size<4>; + using value_type = simd<float, abi_type>; + using mask_type = simd_mask<float, abi_type>; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, element_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + if (m_mask[2]) mem[2] = m_value[2]; + if (m_mask[3]) mem[3] = m_value[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + if (m_mask[2]) mem[2] = m_value[2]; + if (m_mask[3]) mem[3] = m_value[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + float* mem, + simd<std::int32_t, simd_abi::neon_fixed_size<4>> const& index) const { + if (m_mask[0]) mem[index[0]] = m_value[0]; + if (m_mask[1]) mem[index[1]] = m_value[1]; + if (m_mask[2]) mem[index[2]] = m_value[2]; + if (m_mask[3]) mem[index[3]] = m_value[3]; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression<simd_mask<float, simd_abi::neon_fixed_size<4>>, + simd<float, simd_abi::neon_fixed_size<4>>> + : public const_where_expression< + simd_mask<float, simd_abi::neon_fixed_size<4>>, + simd<float, simd_abi::neon_fixed_size<4>>> { + public: + where_expression( + simd_mask<float, 
simd_abi::neon_fixed_size<4>> const& mask_arg, + simd<float, simd_abi::neon_fixed_size<4>>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, element_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + if (m_mask[2]) m_value[2] = mem[2]; + if (m_mask[3]) m_value[3] = mem[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + if (m_mask[2]) m_value[2] = mem[2]; + if (m_mask[3]) m_value[3] = mem[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + float const* mem, + simd<std::int32_t, simd_abi::neon_fixed_size<4>> const& index) { + if (m_mask[0]) m_value[0] = mem[index[0]]; + if (m_mask[1]) m_value[1] = mem[index[1]]; + if (m_mask[2]) m_value[2] = mem[index[2]]; + if (m_mask[3]) m_value[3] = mem[index[3]]; + } + template <class U, + std::enable_if_t<std::is_convertible_v< + U, simd<float, simd_abi::neon_fixed_size<4>>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto const x_as_value_type = + static_cast<simd<float, simd_abi::neon_fixed_size<4>>>( + std::forward<U>(x)); + m_value = static_cast<simd<float, simd_abi::neon_fixed_size<4>>>( + vbslq_f32(static_cast<uint32x4_t>(m_mask), + static_cast<float32x4_t>(x_as_value_type), + static_cast<float32x4_t>(m_value))); + } +}; + template <> class const_where_expression< simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>>, @@ -1542,6 +2328,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1577,6 
+2369,12 @@ class where_expression<simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1584,6 +2382,7 @@ class where_expression<simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t< @@ -1600,6 +2399,108 @@ class where_expression<simd_mask<std::int32_t, simd_abi::neon_fixed_size<2>>, } }; +template <> +class const_where_expression< + simd_mask<std::int32_t, simd_abi::neon_fixed_size<4>>, + simd<std::int32_t, simd_abi::neon_fixed_size<4>>> { + public: + using abi_type = simd_abi::neon_fixed_size<4>; + using value_type = simd<std::int32_t, abi_type>; + using mask_type = simd_mask<std::int32_t, abi_type>; + + protected: + value_type& m_value; + mask_type const& m_mask; + + public: + const_where_expression(mask_type const& mask_arg, value_type const& value_arg) + : m_value(const_cast<value_type&>(value_arg)), m_mask(mask_arg) {} + + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, element_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + if (m_mask[2]) mem[2] = m_value[2]; + if (m_mask[3]) mem[3] = m_value[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + if (m_mask[2]) mem[2] = m_value[2]; + if (m_mask[3]) mem[3] = m_value[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void scatter_to( + std::int32_t* mem, + simd<std::int32_t, simd_abi::neon_fixed_size<4>> const& index) const { + if 
(m_mask[0]) mem[index[0]] = m_value[0]; + if (m_mask[1]) mem[index[1]] = m_value[1]; + if (m_mask[2]) mem[index[2]] = m_value[2]; + if (m_mask[3]) mem[index[3]] = m_value[3]; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& + impl_get_value() const { + return m_value; + } + + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& + impl_get_mask() const { + return m_mask; + } +}; + +template <> +class where_expression<simd_mask<std::int32_t, simd_abi::neon_fixed_size<4>>, + simd<std::int32_t, simd_abi::neon_fixed_size<4>>> + : public const_where_expression< + simd_mask<std::int32_t, simd_abi::neon_fixed_size<4>>, + simd<std::int32_t, simd_abi::neon_fixed_size<4>>> { + public: + where_expression( + simd_mask<std::int32_t, simd_abi::neon_fixed_size<4>> const& mask_arg, + simd<std::int32_t, simd_abi::neon_fixed_size<4>>& value_arg) + : const_where_expression(mask_arg, value_arg) {} + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, element_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + if (m_mask[2]) m_value[2] = mem[2]; + if (m_mask[3]) m_value[3] = mem[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + if (m_mask[2]) m_value[2] = mem[2]; + if (m_mask[3]) m_value[3] = mem[3]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void gather_from( + std::int32_t const* mem, + simd<std::int32_t, simd_abi::neon_fixed_size<4>> const& index) { + if (m_mask[0]) m_value[0] = mem[index[0]]; + if (m_mask[1]) m_value[1] = mem[index[1]]; + if (m_mask[2]) m_value[2] = mem[index[2]]; + if (m_mask[3]) m_value[3] = mem[index[3]]; + } + template < + class U, + std::enable_if_t< + std::is_convertible_v<U, simd<int32_t, simd_abi::neon_fixed_size<4>>>, + bool> = false> + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void operator=(U&& x) { + auto 
const x_as_value_type = + static_cast<simd<int32_t, simd_abi::neon_fixed_size<4>>>( + std::forward<U>(x)); + m_value = static_cast<simd<int32_t, simd_abi::neon_fixed_size<4>>>( + vbslq_s32(static_cast<uint32x4_t>(m_mask), + static_cast<int32x4_t>(x_as_value_type), + static_cast<int32x4_t>(m_value))); + } +}; + template <> class const_where_expression< simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>>, @@ -1622,6 +2523,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1657,6 +2564,12 @@ class where_expression<simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1664,6 +2577,7 @@ class where_expression<simd_mask<std::int64_t, simd_abi::neon_fixed_size<2>>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t<std::is_convertible_v< @@ -1702,6 +2616,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint64_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1737,6 +2657,12 @@ class where_expression<simd_mask<std::uint64_t, simd_abi::neon_fixed_size<2>>, if (m_mask[0]) 
m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, @@ -1744,6 +2670,7 @@ class where_expression<simd_mask<std::uint64_t, simd_abi::neon_fixed_size<2>>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template <class U, std::enable_if_t< std::is_convertible_v< diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp index af7cb1e2c61689ba6d3b9df83f642d482e533b34..61c34a1c76305d891009ad1e73fcd47650748f30 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp @@ -102,7 +102,7 @@ class simd<T, simd_abi::scalar> { KOKKOS_DEFAULTED_FUNCTION simd(simd const&) = default; KOKKOS_DEFAULTED_FUNCTION simd(simd&&) = default; KOKKOS_DEFAULTED_FUNCTION simd& operator=(simd const&) = default; - KOKKOS_DEFAULTED_FUNCTION simd& operator=(simd&&) = default; + KOKKOS_DEFAULTED_FUNCTION simd& operator=(simd&&) = default; KOKKOS_FORCEINLINE_FUNCTION static constexpr std::size_t size() { return 1; } template <class U, std::enable_if_t<std::is_convertible_v<U, value_type>, bool> = false> @@ -127,9 +127,16 @@ class simd<T, simd_abi::scalar> { element_aligned_tag) { m_value = *ptr; } + KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr, vector_aligned_tag) { + m_value = *ptr; + } KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const { *ptr = m_value; } + KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, vector_aligned_tag) const { + *ptr = m_value; + } + KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) { return m_value; } @@ -224,7 +231,7 @@ template <typename T> using data_type = std::conditional_t<std::is_floating_point_v<T>, 
T, double>; return Experimental::simd<data_type, Experimental::simd_abi::scalar>( Kokkos::floor(static_cast<data_type>(a[0]))); -}; +} template <typename T> [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil( @@ -232,7 +239,7 @@ template <typename T> using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>; return Experimental::simd<data_type, Experimental::simd_abi::scalar>( Kokkos::ceil(static_cast<data_type>(a[0]))); -}; +} template <typename T> [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round( @@ -240,7 +247,7 @@ template <typename T> using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>; return Experimental::simd<data_type, Experimental::simd_abi::scalar>( Experimental::round_half_to_nearest_even(static_cast<data_type>(a[0]))); -}; +} template <typename T> [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc( @@ -248,7 +255,7 @@ template <typename T> using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>; return Experimental::simd<data_type, Experimental::simd_abi::scalar>( Kokkos::trunc(static_cast<data_type>(a[0]))); -}; +} template <class T> [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION @@ -308,6 +315,10 @@ class const_where_expression<simd_mask<T, simd_abi::scalar>, void copy_to(T* mem, element_aligned_tag) const { if (static_cast<bool>(m_mask)) *mem = static_cast<T>(m_value); } + KOKKOS_FORCEINLINE_FUNCTION + void copy_to(T* mem, vector_aligned_tag) const { + if (static_cast<bool>(m_mask)) *mem = static_cast<T>(m_value); + } template <class Integral> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<std::is_integral_v<Integral>> scatter_to(T* mem, simd<Integral, simd_abi::scalar> const& index) const { @@ -315,13 +326,13 @@ class const_where_expression<simd_mask<T, simd_abi::scalar>, mem[static_cast<Integral>(index)] = static_cast<T>(m_value); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& - impl_get_value() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION value_type 
const& impl_get_value() + const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& - impl_get_mask() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION mask_type const& impl_get_mask() + const { return m_mask; } }; @@ -344,6 +355,10 @@ class where_expression<simd_mask<T, simd_abi::scalar>, void copy_from(T const* mem, element_aligned_tag) { if (static_cast<bool>(this->m_mask)) this->m_value = *mem; } + KOKKOS_FORCEINLINE_FUNCTION + void copy_from(T const* mem, vector_aligned_tag) { + if (static_cast<bool>(this->m_mask)) this->m_value = *mem; + } template <class Integral> KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t<std::is_integral_v<Integral>> gather_from(T const* mem, simd<Integral, simd_abi::scalar> const& index) { diff --git a/packages/kokkos/simd/unit_tests/CMakeLists.txt b/packages/kokkos/simd/unit_tests/CMakeLists.txt index 75d557e8b525835a0fb8596bac4e683e55254102..2a0d6e614708b42ec6a80132ee0315f56af542cb 100644 --- a/packages/kokkos/simd/unit_tests/CMakeLists.txt +++ b/packages/kokkos/simd/unit_tests/CMakeLists.txt @@ -1,7 +1,5 @@ -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/simd/unit_tests/include) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/simd/unit_tests/include) -KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_SIMD - SOURCES - UnitTestMain.cpp - TestSIMD.cpp) +if(NOT (Kokkos_ENABLE_CUDA AND WIN32)) + kokkos_add_executable_and_test(UnitTest_SIMD SOURCES UnitTestMain.cpp TestSIMD.cpp) +endif() diff --git a/packages/kokkos/simd/unit_tests/TestSIMD.cpp b/packages/kokkos/simd/unit_tests/TestSIMD.cpp index 61c076e82466da2f31f2739383a42c7515d30594..df18b43c4e35b9899089be567a373de2f54becf5 100644 --- a/packages/kokkos/simd/unit_tests/TestSIMD.cpp +++ b/packages/kokkos/simd/unit_tests/TestSIMD.cpp @@ -21,3 +21,5 @@ #include <TestSIMD_Condition.hpp> #include <TestSIMD_GeneratorCtors.hpp> #include <TestSIMD_WhereExpressions.hpp> +#include <TestSIMD_Reductions.hpp> +#include <TestSIMD_Construction.hpp> diff --git 
a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp index 6529f20e66ac416707276bd1479104f8dd30bfc6..995074904feb567e2e354f6b74ed841bc4ee8c32 100644 --- a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -81,7 +81,13 @@ class absolutes { auto on_host(T const& a) const { if constexpr (std::is_signed_v<typename T::value_type>) { #if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif return Kokkos::Experimental::abs(a); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif #else return Kokkos::abs(a); #endif @@ -209,4 +215,165 @@ class shift_left { } }; +class cbrt_op { + public: + template <typename T> + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::cbrt(a); +#else + return Kokkos::cbrt(a); +#endif + } + template <typename T> + auto on_host_serial(T const& a) const { + return Kokkos::cbrt(a); + } +}; + +class exp_op { + public: + template <typename T> + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::exp(a); +#else + return Kokkos::exp(a); +#endif + } + template <typename T> + auto on_host_serial(T const& a) const { + return Kokkos::exp(a); + } +}; + +class log_op { + public: + template <typename T> + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::log(a); +#else + return Kokkos::log(a); +#endif + } + template <typename T> + auto on_host_serial(T const& a) const { + return Kokkos::log(a); + } +}; + +class hmin { + public: + template <typename T> + auto on_host(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template <typename T> + auto on_host_serial(T const& a) const { + using 
DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity<DataType>::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } + + template <typename T> + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template <typename T> + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity<DataType>::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } +}; + +class hmax { + public: + template <typename T> + auto on_host(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template <typename T> + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity<DataType>::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } + + template <typename T> + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template <typename T> + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity<DataType>::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } +}; + +class reduce { + public: + template <typename T> + auto on_host(T const& a) const { + using DataType = 
typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template <typename T> + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity<DataType>::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } + + template <typename T> + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template <typename T> + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity<DataType>::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } +}; + #endif diff --git a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp index ae2ab2c697c5c1173f83b387bebe21b2e0f7b31b..9719855f0ffea946831f68f7110b27a23294b956 100644 --- a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp +++ b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp @@ -93,7 +93,7 @@ class load_element_aligned { bool host_load(T const* mem, std::size_t n, Kokkos::Experimental::simd<T, Abi>& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); return true; } template <class T, class Abi> @@ -101,7 +101,26 @@ class load_element_aligned { T const* mem, std::size_t n, Kokkos::Experimental::simd<T, Abi>& result) const { if (n < result.size()) return false; - 
result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); + return true; + } +}; + +class load_vector_aligned { + public: + template <class T, class Abi> + bool host_load(T const* mem, std::size_t n, + Kokkos::Experimental::simd<T, Abi>& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); + return true; + } + template <class T, class Abi> + KOKKOS_INLINE_FUNCTION bool device_load( + T const* mem, std::size_t n, + Kokkos::Experimental::simd<T, Abi>& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); return true; } }; @@ -116,9 +135,8 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); - where(!mask, result) = 0; + result = T(0); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); return true; } template <class T, class Abi> @@ -130,8 +148,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = T(0); return true; } @@ -164,4 +181,14 @@ class load_as_scalars { } }; +// Simple check to loosely test that T is a complete type. +// Some capabilities are only defined for specific data type and abi pairs (i.e. +// extended vector width); this is used to exclude pairs that +// are not defined from being tested. 
+template <typename T, typename = void> +constexpr bool is_type_v = false; + +template <typename T> +constexpr bool is_type_v<T, decltype(void(sizeof(T)))> = true; + #endif diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp index f8d8cc70fa4ed91e1731d2510d6a8f7e70685358..bf22cf3352b07baf853b67c6abb34f610c6d3f13 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_Condition.hpp @@ -22,21 +22,23 @@ template <typename Abi, typename DataType> inline void host_check_condition() { - using simd_type = typename Kokkos::Experimental::simd<DataType, Abi>; - using mask_type = typename simd_type::mask_type; - - auto condition_op = [](mask_type const& mask, simd_type const& a, - simd_type const& b) { - return Kokkos::Experimental::condition(mask, a, b); - }; - - simd_type value_a(16); - simd_type value_b(20); - - auto condition_result = condition_op(mask_type(false), value_a, value_b); - EXPECT_TRUE(all_of(condition_result == value_b)); - condition_result = condition_op(mask_type(true), value_a, value_b); - EXPECT_TRUE(all_of(condition_result == value_a)); + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using simd_type = typename Kokkos::Experimental::simd<DataType, Abi>; + using mask_type = typename simd_type::mask_type; + + auto condition_op = [](mask_type const& mask, simd_type const& a, + simd_type const& b) { + return Kokkos::Experimental::condition(mask, a, b); + }; + + simd_type value_a(16); + simd_type value_b(20); + + auto condition_result = condition_op(mask_type(false), value_a, value_b); + EXPECT_TRUE(all_of(condition_result == value_b)); + condition_result = condition_op(mask_type(true), value_a, value_b); + EXPECT_TRUE(all_of(condition_result == value_a)); + } } template <typename Abi, typename... 
DataTypes> @@ -54,22 +56,24 @@ inline void host_check_condition_all_abis( template <typename Abi, typename DataType> KOKKOS_INLINE_FUNCTION void device_check_condition() { - using simd_type = typename Kokkos::Experimental::simd<DataType, Abi>; - using mask_type = typename simd_type::mask_type; - kokkos_checker checker; - - auto condition_op = [](mask_type const& mask, simd_type const& a, - simd_type const& b) { - return Kokkos::Experimental::condition(mask, a, b); - }; - - simd_type value_a(16); - simd_type value_b(20); - - auto condition_result = condition_op(mask_type(false), value_a, value_b); - checker.truth(all_of(condition_result == value_b)); - condition_result = condition_op(mask_type(true), value_a, value_b); - checker.truth(all_of(condition_result == value_a)); + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using simd_type = typename Kokkos::Experimental::simd<DataType, Abi>; + using mask_type = typename simd_type::mask_type; + kokkos_checker checker; + + auto condition_op = [](mask_type const& mask, simd_type const& a, + simd_type const& b) { + return Kokkos::Experimental::condition(mask, a, b); + }; + + simd_type value_a(16); + simd_type value_b(20); + + auto condition_result = condition_op(mask_type(false), value_a, value_b); + checker.truth(all_of(condition_result == value_b)); + condition_result = condition_op(mask_type(true), value_a, value_b); + checker.truth(all_of(condition_result == value_a)); + } } template <typename Abi, typename... DataTypes> diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0ceb1496c47d2d5e11d37d338d432a8abfc93175 --- /dev/null +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_CONSTRUCTION_HPP +#define KOKKOS_TEST_SIMD_CONSTRUCTION_HPP + +#include <Kokkos_SIMD.hpp> +#include <SIMDTesting_Utilities.hpp> + +template <typename Abi, typename DataType> +inline void host_test_simd_traits() { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + + static_assert(std::is_nothrow_default_constructible_v<simd_type>); + static_assert(std::is_nothrow_copy_assignable_v<simd_type>); + static_assert(std::is_nothrow_copy_constructible_v<simd_type>); + static_assert(std::is_nothrow_move_assignable_v<simd_type>); + static_assert(std::is_nothrow_move_constructible_v<simd_type>); + + simd_type default_simd, result; + simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + simd_type copy_simd(test_simd); + simd_type move_simd(std::move(copy_simd)); + default_simd = std::move(move_simd); + result = default_simd; + EXPECT_TRUE(all_of(test_simd == result)); +} + +template <typename Abi, typename DataType> +inline void host_test_mask_traits() { + using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>; + + static_assert(std::is_nothrow_default_constructible_v<mask_type>); + static_assert(std::is_nothrow_copy_assignable_v<mask_type>); + static_assert(std::is_nothrow_copy_constructible_v<mask_type>); + static_assert(std::is_nothrow_move_assignable_v<mask_type>); + static_assert(std::is_nothrow_move_constructible_v<mask_type>); + + mask_type default_mask, result; + mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + mask_type 
copy_mask(test_mask); + mask_type move_mask(std::move(copy_mask)); + default_mask = std::move(move_mask); + result = default_mask; + EXPECT_EQ(test_mask, result); +} + +template <typename Abi, typename DataType> +inline void host_check_construction() { + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + host_test_simd_traits<Abi, DataType>(); + host_test_mask_traits<Abi, DataType>(); + } +} + +template <typename Abi, typename... DataTypes> +inline void host_check_construction_all_types( + Kokkos::Experimental::Impl::data_types<DataTypes...>) { + (host_check_construction<Abi, DataTypes>(), ...); +} + +template <typename... Abis> +inline void host_check_construction_all_abis( + Kokkos::Experimental::Impl::abi_set<Abis...>) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (host_check_construction_all_types<Abis>(DataTypes()), ...); +} + +template <typename Abi, typename DataType> +KOKKOS_INLINE_FUNCTION void device_test_simd_traits() { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + + simd_type default_simd, result; + simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + simd_type copy_simd(test_simd); + simd_type move_simd(std::move(copy_simd)); + default_simd = std::move(move_simd); + result = default_simd; + + kokkos_checker checker; + checker.truth(all_of(test_simd == result)); +} + +template <typename Abi, typename DataType> +KOKKOS_INLINE_FUNCTION void device_test_mask_traits() { + using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>; + + mask_type default_mask, result; + mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + mask_type copy_mask(test_mask); + mask_type move_mask(std::move(copy_mask)); + default_mask = std::move(move_mask); + result = default_mask; + + kokkos_checker checker; + checker.truth(test_mask == result); +} + +template <typename Abi, typename DataType> +KOKKOS_INLINE_FUNCTION void device_check_construction() { + if 
constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + device_test_simd_traits<Abi, DataType>(); + device_test_mask_traits<Abi, DataType>(); + } +} + +template <typename Abi, typename... DataTypes> +KOKKOS_INLINE_FUNCTION void device_check_construction_all_types( + Kokkos::Experimental::Impl::data_types<DataTypes...>) { + (device_check_construction<Abi, DataTypes>(), ...); +} + +template <typename... Abis> +KOKKOS_INLINE_FUNCTION void device_check_construction_all_abis( + Kokkos::Experimental::Impl::abi_set<Abis...>) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_construction_all_types<Abis>(DataTypes()), ...); +} + +class simd_device_construction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_construction_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_construction) { + host_check_construction_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_construction) { + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1), + simd_device_construction_functor()); +} + +#endif diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp index b98871bbab80921628001b3dc4e6d1284bf4d10d..20b0729762c453382938c90eef99f066aa3e147d 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_Conversions.hpp @@ -22,40 +22,42 @@ template <typename Abi> inline void host_check_conversions() { - { - auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1); - auto b = Kokkos::Experimental::simd<std::int64_t, Abi>(a); - EXPECT_TRUE(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd<std::int32_t, Abi>(1); - auto b = Kokkos::Experimental::simd<std::uint64_t, Abi>(a); - EXPECT_TRUE(all_of(b == decltype(b)(1))); - } - { - auto a = 
Kokkos::Experimental::simd<std::uint64_t, Abi>(1); - auto b = Kokkos::Experimental::simd<std::int32_t, Abi>(a); - EXPECT_TRUE(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd_mask<double, Abi>(true); - auto b = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(a); - EXPECT_TRUE(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); - auto b = Kokkos::Experimental::simd_mask<std::uint64_t, Abi>(a); - EXPECT_TRUE(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); - auto b = Kokkos::Experimental::simd_mask<std::int64_t, Abi>(a); - EXPECT_TRUE(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); - auto b = Kokkos::Experimental::simd_mask<double, Abi>(a); - EXPECT_TRUE(b == decltype(b)(true)); + if constexpr (is_type_v<Kokkos::Experimental::simd<uint64_t, Abi>>) { + { + auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1); + auto b = Kokkos::Experimental::simd<std::int64_t, Abi>(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd<std::int32_t, Abi>(1); + auto b = Kokkos::Experimental::simd<std::uint64_t, Abi>(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1); + auto b = Kokkos::Experimental::simd<std::int32_t, Abi>(a); + EXPECT_TRUE(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd_mask<double, Abi>(true); + auto b = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(a); + EXPECT_TRUE(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); + auto b = Kokkos::Experimental::simd_mask<std::uint64_t, Abi>(a); + EXPECT_TRUE(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); + auto b = Kokkos::Experimental::simd_mask<std::int64_t, Abi>(a); + EXPECT_TRUE(b == 
decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); + auto b = Kokkos::Experimental::simd_mask<double, Abi>(a); + EXPECT_TRUE(b == decltype(b)(true)); + } } } @@ -67,41 +69,43 @@ inline void host_check_conversions_all_abis( template <typename Abi> KOKKOS_INLINE_FUNCTION void device_check_conversions() { - kokkos_checker checker; - { - auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1); - auto b = Kokkos::Experimental::simd<std::int64_t, Abi>(a); - checker.truth(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd<std::int32_t, Abi>(1); - auto b = Kokkos::Experimental::simd<std::uint64_t, Abi>(a); - checker.truth(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1); - auto b = Kokkos::Experimental::simd<std::int32_t, Abi>(a); - checker.truth(all_of(b == decltype(b)(1))); - } - { - auto a = Kokkos::Experimental::simd_mask<double, Abi>(true); - auto b = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(a); - checker.truth(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); - auto b = Kokkos::Experimental::simd_mask<std::uint64_t, Abi>(a); - checker.truth(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); - auto b = Kokkos::Experimental::simd_mask<std::int64_t, Abi>(a); - checker.truth(b == decltype(b)(true)); - } - { - auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); - auto b = Kokkos::Experimental::simd_mask<double, Abi>(a); - checker.truth(b == decltype(b)(true)); + if constexpr (is_type_v<Kokkos::Experimental::simd<uint64_t, Abi>>) { + kokkos_checker checker; + { + auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1); + auto b = Kokkos::Experimental::simd<std::int64_t, Abi>(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd<std::int32_t, Abi>(1); + auto b = 
Kokkos::Experimental::simd<std::uint64_t, Abi>(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd<std::uint64_t, Abi>(1); + auto b = Kokkos::Experimental::simd<std::int32_t, Abi>(a); + checker.truth(all_of(b == decltype(b)(1))); + } + { + auto a = Kokkos::Experimental::simd_mask<double, Abi>(true); + auto b = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); + auto b = Kokkos::Experimental::simd_mask<std::uint64_t, Abi>(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); + auto b = Kokkos::Experimental::simd_mask<std::int64_t, Abi>(a); + checker.truth(b == decltype(b)(true)); + } + { + auto a = Kokkos::Experimental::simd_mask<std::int32_t, Abi>(true); + auto b = Kokkos::Experimental::simd_mask<double, Abi>(a); + checker.truth(b == decltype(b)(true)); + } } } diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 4feff3a89d2846b93f55b7ef0933d692da757b7c..1a61fd9cbbb68083f794950b3a7aee0b83d1d6a5 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -22,46 +22,50 @@ template <typename Abi, typename DataType> inline void host_check_gen_ctor() { - using simd_type = Kokkos::Experimental::simd<DataType, Abi>; - using mask_type = typename simd_type::mask_type; - constexpr std::size_t lanes = simd_type::size(); - - DataType init[lanes]; - DataType expected[lanes]; - mask_type init_mask(false); - - for (std::size_t i = 0; i < lanes; ++i) { - if (i % 3 == 0) init_mask[i] = true; - init[i] = 7; - expected[i] = (init_mask[i]) ? 
init[i] * 9 : init[i]; - } - - simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); - - simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); - - if constexpr (std::is_same_v<Abi, Kokkos::Experimental::simd_abi::scalar>) { - simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); - host_check_equality(basic, rhs, lanes); - - simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); - mask_type mask(KOKKOS_LAMBDA(std::size_t i) { return init_mask[i]; }); - simd_type result( - KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); - - host_check_equality(blend, result, lanes); - } else { - simd_type basic([=](std::size_t i) { return init[i]; }); - host_check_equality(basic, rhs, lanes); - - simd_type lhs([=](std::size_t i) { return init[i] * 9; }); - mask_type mask([=](std::size_t i) { return init_mask[i]; }); - simd_type result( - [=](std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); - - host_check_equality(blend, result, lanes); + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + using mask_type = typename simd_type::mask_type; + constexpr std::size_t lanes = simd_type::size(); + + DataType init[lanes]; + DataType expected[lanes]; + mask_type init_mask(false); + + for (std::size_t i = 0; i < lanes; ++i) { + if (i % 3 == 0) init_mask[i] = true; + init[i] = 7; + expected[i] = (init_mask[i]) ? 
init[i] * 9 : init[i]; + } + + simd_type rhs; + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); + + simd_type blend; + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); + +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) + if constexpr (std::is_same_v<Abi, Kokkos::Experimental::simd_abi::scalar>) { + simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); + host_check_equality(basic, rhs, lanes); + + simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); + mask_type mask(KOKKOS_LAMBDA(std::size_t i) { return init_mask[i]; }); + simd_type result( + KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + + host_check_equality(blend, result, lanes); + } else { + simd_type basic([=](std::size_t i) { return init[i]; }); + host_check_equality(basic, rhs, lanes); + + simd_type lhs([=](std::size_t i) { return init[i] * 9; }); + mask_type mask([=](std::size_t i) { return init_mask[i]; }); + simd_type result( + [=](std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + + host_check_equality(blend, result, lanes); + } +#endif } } @@ -80,32 +84,34 @@ inline void host_check_gen_ctors_all_abis( template <typename Abi, typename DataType> KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { - using simd_type = Kokkos::Experimental::simd<DataType, Abi>; - using mask_type = typename simd_type::mask_type; - constexpr std::size_t lanes = simd_type::size(); - - DataType init[lanes]; - DataType expected[lanes]; - mask_type mask(false); - - for (std::size_t i = 0; i < lanes; ++i) { - if (i % 3 == 0) mask[i] = true; - init[i] = 7; - expected[i] = (mask[i]) ? 
init[i] * 9 : init[i]; - } + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + using mask_type = typename simd_type::mask_type; + constexpr std::size_t lanes = simd_type::size(); + + DataType init[lanes]; + DataType expected[lanes]; + mask_type mask(false); - simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); - simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); - device_check_equality(basic, rhs, lanes); + for (std::size_t i = 0; i < lanes; ++i) { + if (i % 3 == 0) mask[i] = true; + init[i] = 7; + expected[i] = (mask[i]) ? init[i] * 9 : init[i]; + } - simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); - simd_type result( - KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); + simd_type rhs; + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); + device_check_equality(basic, rhs, lanes); - simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); - device_check_equality(result, blend, lanes); + simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); + simd_type result( + KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); + + simd_type blend; + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); + device_check_equality(result, blend, lanes); + } } template <typename Abi, typename... 
DataTypes> diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp index a93c52e9a8d56b139668ffa717f73cdff017f3eb..c3d4ac594d0912bbe8d1e98edd1a31dfe99b5661 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_MaskOps.hpp @@ -22,25 +22,27 @@ template <typename Abi, typename DataType> inline void host_check_mask_ops() { - using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>; - - EXPECT_FALSE(none_of(mask_type(true))); - EXPECT_TRUE(none_of(mask_type(false))); - EXPECT_TRUE(all_of(mask_type(true))); - EXPECT_FALSE(all_of(mask_type(false))); - EXPECT_TRUE(any_of(mask_type(true))); - EXPECT_FALSE(any_of(mask_type(false))); - - for (std::size_t i = 0; i < mask_type::size(); ++i) { - mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; }); - - EXPECT_TRUE(any_of(test_mask)); - EXPECT_FALSE(none_of(test_mask)); - - if constexpr (mask_type::size() > 1) { - EXPECT_FALSE(all_of(test_mask)); - } else { - EXPECT_TRUE(all_of(test_mask)); + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>; + + EXPECT_FALSE(none_of(mask_type(true))); + EXPECT_TRUE(none_of(mask_type(false))); + EXPECT_TRUE(all_of(mask_type(true))); + EXPECT_FALSE(all_of(mask_type(false))); + EXPECT_TRUE(any_of(mask_type(true))); + EXPECT_FALSE(any_of(mask_type(false))); + + for (std::size_t i = 0; i < mask_type::size(); ++i) { + mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; }); + + EXPECT_TRUE(any_of(test_mask)); + EXPECT_FALSE(none_of(test_mask)); + + if constexpr (mask_type::size() > 1) { + EXPECT_FALSE(all_of(test_mask)); + } else { + EXPECT_TRUE(all_of(test_mask)); + } } } } @@ -60,25 +62,27 @@ inline void host_check_mask_ops_all_abis( template <typename Abi, typename DataType> KOKKOS_INLINE_FUNCTION void 
device_check_mask_ops() { - using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>; - kokkos_checker checker; - checker.truth(!none_of(mask_type(true))); - checker.truth(none_of(mask_type(false))); - checker.truth(all_of(mask_type(true))); - checker.truth(!all_of(mask_type(false))); - checker.truth(any_of(mask_type(true))); - checker.truth(!any_of(mask_type(false))); - - for (std::size_t i = 0; i < mask_type::size(); ++i) { - mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; }); - - checker.truth(any_of(test_mask)); - checker.truth(!none_of(test_mask)); - - if constexpr (mask_type::size() > 1) { - checker.truth(!all_of(test_mask)); - } else { - checker.truth(all_of(test_mask)); + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>; + kokkos_checker checker; + checker.truth(!none_of(mask_type(true))); + checker.truth(none_of(mask_type(false))); + checker.truth(all_of(mask_type(true))); + checker.truth(!all_of(mask_type(false))); + checker.truth(any_of(mask_type(true))); + checker.truth(!any_of(mask_type(false))); + + for (std::size_t i = 0; i < mask_type::size(); ++i) { + mask_type test_mask(KOKKOS_LAMBDA(std::size_t j) { return i == j; }); + + checker.truth(any_of(test_mask)); + checker.truth(!none_of(test_mask)); + + if constexpr (mask_type::size() > 1) { + checker.truth(!all_of(test_mask)); + } else { + checker.truth(all_of(test_mask)); + } } } } diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp index 802e41efe5f234dafaa0e42a9cba049bd5d11056..61ca852659a5a0016306250dab16988a2803ffd9 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -61,13 +61,18 @@ void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n, simd_type arg; bool const loaded_arg = loader.host_load(args + 
i, nlanes, arg); if (!loaded_arg) continue; - auto computed_result = unary_op.on_host(arg); - decltype(computed_result) expected_result; + decltype(unary_op.on_host(arg)) expected_result; for (std::size_t lane = 0; lane < simd_type::size(); ++lane) { - if (lane < nlanes) + if (lane < nlanes) { + if constexpr (std::is_same_v<UnaryOp, cbrt_op> || + std::is_same_v<UnaryOp, exp_op> || + std::is_same_v<UnaryOp, log_op>) + arg[lane] = Kokkos::abs(arg[lane]); expected_result[lane] = unary_op.on_host_serial(T(arg[lane])); + } } + auto computed_result = unary_op.on_host(arg); host_check_equality(expected_result, computed_result, nlanes); } } @@ -78,6 +83,7 @@ inline void host_check_math_op_all_loaders(Op op, std::size_t n, host_check_math_op_one_loader<Abi, load_element_aligned>(op, n, args...); host_check_math_op_one_loader<Abi, load_masked>(op, n, args...); host_check_math_op_one_loader<Abi, load_as_scalars>(op, n, args...); + host_check_math_op_one_loader<Abi, load_vector_aligned>(op, n, args...); } template <typename Abi, typename DataType, size_t n> @@ -96,6 +102,13 @@ inline void host_check_all_math_ops(const DataType (&first_args)[n], // TODO: Place fallback implementations for all simd integer types if constexpr (std::is_floating_point_v<DataType>) { host_check_math_op_all_loaders<Abi>(divides(), n, first_args, second_args); + +#if defined(__INTEL_COMPILER) && \ + (defined(KOKKOS_ARCH_AVX2) || defined(KOKKOS_ARCH_AVX512XEON)) + host_check_math_op_all_loaders<Abi>(cbrt_op(), n, first_args); + host_check_math_op_all_loaders<Abi>(exp_op(), n, first_args); + host_check_math_op_all_loaders<Abi>(log_op(), n, first_args); +#endif } } @@ -108,25 +121,34 @@ inline void host_check_abi_size() { template <typename Abi, typename DataType> inline void host_check_math_ops() { - constexpr size_t n = 11; - - host_check_abi_size<Abi, DataType>(); - - if constexpr (!std::is_integral_v<DataType>) { - DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, - -2.0, 10.0, 0.0, 1.2, 
-2.8}; - DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, - -2.4, 1.0, 13.0, -3.2, -2.1}; - host_check_all_math_ops<Abi>(first_args, second_args); - } else { - if constexpr (std::is_signed_v<DataType>) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + constexpr size_t alignment = + Kokkos::Experimental::simd<DataType, Abi>::size() * sizeof(DataType); + + host_check_abi_size<Abi, DataType>(); + + if constexpr (!std::is_integral_v<DataType>) { + alignas(alignment) DataType const first_args[] = { + 0.1, 0.4, 0.5, 0.7, 1.0, 1.5, -2.0, 10.0, + 0.0, 1.2, -2.8, 3.0, 4.0, -0.1, 5.0, -0.2}; + alignas(alignment) DataType const second_args[] = { + 1.0, 0.2, 1.1, 1.8, -0.1, -3.0, -2.4, 1.0, + 13.0, -3.2, -2.1, 3.0, -15.0, -0.5, -0.2, -0.2}; host_check_all_math_ops<Abi>(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; - host_check_all_math_ops<Abi>(first_args, second_args); + if constexpr (std::is_signed_v<DataType>) { + alignas(alignment) DataType const first_args[] = { + 1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2, -3, 7, 4, -9, -15}; + alignas(alignment) DataType const second_args[] = { + 1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2, 10, -15, 7, 2, -10}; + host_check_all_math_ops<Abi>(first_args, second_args); + } else { + alignas(alignment) DataType const first_args[] = { + 1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2, 11, 5, 8, 2, 14}; + alignas(alignment) DataType const second_args[] = { + 1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2, 3, 6, 20, 5, 14}; + host_check_all_math_ops<Abi>(first_args, second_args); + } } } } @@ -202,6 +224,7 @@ KOKKOS_INLINE_FUNCTION void device_check_math_op_all_loaders(Op op, device_check_math_op_one_loader<Abi, load_element_aligned>(op, n, args...); 
device_check_math_op_one_loader<Abi, load_masked>(op, n, args...); device_check_math_op_one_loader<Abi, load_as_scalars>(op, n, args...); + device_check_math_op_one_loader<Abi, load_vector_aligned>(op, n, args...); } template <typename Abi, typename DataType, size_t n> @@ -233,25 +256,31 @@ KOKKOS_INLINE_FUNCTION void device_check_abi_size() { template <typename Abi, typename DataType> KOKKOS_INLINE_FUNCTION void device_check_math_ops() { - constexpr size_t n = 11; - - device_check_abi_size<Abi, DataType>(); - - if constexpr (!std::is_integral_v<DataType>) { - DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, - -2.0, 10.0, 0.0, 1.2, -2.8}; - DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, - -2.4, 1.0, 13.0, -3.2, -2.1}; - device_check_all_math_ops<Abi>(first_args, second_args); - } else { - if constexpr (std::is_signed_v<DataType>) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + device_check_abi_size<Abi, DataType>(); + + if constexpr (!std::is_integral_v<DataType>) { + DataType const first_args[] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, + -2.0, 10.0, 0.0, 1.2, -2.8, 3.0, + 4.0, -0.1, 5.0, -0.2}; + DataType const second_args[] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, + -2.4, 1.0, 13.0, -3.2, -2.1, 3.0, + -15.0, -0.5, -0.2, -0.2}; device_check_all_math_ops<Abi>(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; - device_check_all_math_ops<Abi>(first_args, second_args); + if constexpr (std::is_signed_v<DataType>) { + DataType const first_args[] = {1, 2, -1, 10, 0, 1, -2, 10, + 0, 1, -2, -3, 7, 4, -9, -15}; + DataType const second_args[] = {1, 2, 1, 1, 1, -3, -2, 1, + 13, -3, -2, 10, -15, 7, 2, -10}; + device_check_all_math_ops<Abi>(first_args, second_args); + 
} else { + DataType const first_args[] = {1, 2, 1, 10, 0, 1, 2, 10, + 0, 1, 2, 11, 5, 8, 2, 14}; + DataType const second_args[] = {1, 2, 1, 1, 1, 3, 2, 1, + 13, 3, 2, 3, 6, 20, 5, 14}; + device_check_all_math_ops<Abi>(first_args, second_args); + } } } } @@ -282,8 +311,20 @@ TEST(simd, host_math_ops) { } TEST(simd, device_math_ops) { - Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1), - simd_device_math_ops_functor()); +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif +#if defined(KOKKOS_ENABLE_OPENACC) && \ + defined(KOKKOS_COMPILER_CLANG) // FIXME_CLACC + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_math_ops_functor()); } #endif diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..25fe25bc090e390ea2c8815fed82db389e9f3515 --- /dev/null +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -0,0 +1,199 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_REDUCTIONS_HPP +#define KOKKOS_TEST_SIMD_REDUCTIONS_HPP + +#include <Kokkos_SIMD.hpp> +#include <SIMDTesting_Utilities.hpp> + +template <typename Abi, typename Loader, typename ReductionOp, typename T> +inline void host_check_reduction_one_loader(ReductionOp reduce_op, + std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd<T, Abi>; + using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.host_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); /* enable loaded lanes only; n may exceed width */ + for (std::size_t j = 0; j < nlanes; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_host_serial(value); + auto computed = reduce_op.on_host(value); + + gtest_checker().equality(expected, computed); + } +} + +template <typename Abi, typename ReductionOp, typename T> +inline void host_check_reduction_all_loaders(ReductionOp reduce_op, + std::size_t n, T const* args) { + host_check_reduction_one_loader<Abi, load_element_aligned>(reduce_op, n, + args); + host_check_reduction_one_loader<Abi, load_masked>(reduce_op, n, args); + host_check_reduction_one_loader<Abi, load_as_scalars>(reduce_op, n, args); +} + +template <typename Abi, typename DataType, size_t n> +inline void host_check_all_reductions(const DataType (&args)[n]) { + host_check_reduction_all_loaders<Abi>(hmin(), n, args); + host_check_reduction_all_loaders<Abi>(hmax(), n, args); + host_check_reduction_all_loaders<Abi>(reduce(), n, args); +} + +template <typename Abi, typename DataType> +inline void host_check_reductions() { + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, 
Abi>>) { + constexpr size_t n = 16; + + if constexpr (std::is_signed_v<DataType>) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, + 0, 1, -2, -15, 5, 17, -22, 20}; + host_check_all_reductions<Abi>(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, + 0, 1, 2, 15, 5, 17, 22, 20}; + host_check_all_reductions<Abi>(args); + } + } +} + +template <typename Abi, typename... DataTypes> +inline void host_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types<DataTypes...>) { + (host_check_reductions<Abi, DataTypes>(), ...); +} + +template <typename... Abis> +inline void host_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set<Abis...>) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (host_check_reductions_all_types<Abis>(DataTypes()), ...); +} + +template <typename Abi, typename Loader, typename ReductionOp, typename T> +KOKKOS_INLINE_FUNCTION void device_check_reduction_one_loader( + ReductionOp reduce_op, std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd<T, Abi>; + using mask_type = typename Kokkos::Experimental::simd<T, Abi>::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.device_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); /* enable loaded lanes only; n may exceed width */ + for (std::size_t j = 0; j < nlanes; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_device_serial(value); + auto computed = reduce_op.on_device(value); + + kokkos_checker().equality(expected, computed); + } +} + +template <typename Abi, typename ReductionOp, typename T> +KOKKOS_INLINE_FUNCTION void device_check_reduction_all_loaders( + ReductionOp reduce_op, std::size_t n, T const* args) { + device_check_reduction_one_loader<Abi, 
load_element_aligned>(reduce_op, n, + args); + device_check_reduction_one_loader<Abi, load_masked>(reduce_op, n, args); + device_check_reduction_one_loader<Abi, load_as_scalars>(reduce_op, n, args); +} + +template <typename Abi, typename DataType, size_t n> +KOKKOS_INLINE_FUNCTION void device_check_all_reductions( + const DataType (&args)[n]) { + device_check_reduction_all_loaders<Abi>(hmin(), n, args); + device_check_reduction_all_loaders<Abi>(hmax(), n, args); + device_check_reduction_all_loaders<Abi>(reduce(), n, args); +} + +template <typename Abi, typename DataType> +KOKKOS_INLINE_FUNCTION void device_check_reductions() { + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + constexpr size_t n = 16; + + if constexpr (std::is_signed_v<DataType>) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, + 0, 1, -2, -15, 5, 17, -22, 20}; + device_check_all_reductions<Abi>(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, + 0, 1, 2, 15, 5, 17, 22, 20}; + device_check_all_reductions<Abi>(args); + } + } +} + +template <typename Abi, typename... DataTypes> +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types<DataTypes...>) { + (device_check_reductions<Abi, DataTypes>(), ...); +} + +template <typename... 
Abis> +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set<Abis...>) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_reductions_all_types<Abis>(DataTypes()), ...); +} + +class simd_device_reduction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_reductions_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_reductions) { + host_check_reductions_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_reductions) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif +#if defined(KOKKOS_ENABLE_OPENACC) && \ + defined(KOKKOS_COMPILER_CLANG) // FIXME_CLACC + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_reduction_functor()); +} + +#endif diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp index f6fdcb920ed22ed89397cc35eaea03c1cb05b7f1..7329f085018c87c96186051f74f0150d82d100d7 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp @@ -85,10 +85,11 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by, n); host_check_shift_on_one_loader<Abi, load_as_scalars>(shift_op, test_vals, shift_by, n); + host_check_shift_on_one_loader<Abi, load_vector_aligned>(shift_op, test_vals, + shift_by, n); Kokkos::Experimental::simd<DataType, Abi> shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - 
Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); host_check_shift_by_lanes_on_one_loader<Abi, load_element_aligned>( shift_op, test_vals, shift_by_lanes); @@ -96,36 +97,41 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by_lanes); host_check_shift_by_lanes_on_one_loader<Abi, load_as_scalars>( shift_op, test_vals, shift_by_lanes); + host_check_shift_by_lanes_on_one_loader<Abi, load_vector_aligned>( + shift_op, test_vals, shift_by_lanes); } template <typename Abi, typename DataType> inline void host_check_shift_ops() { - if constexpr (std::is_integral_v<DataType>) { - using simd_type = Kokkos::Experimental::simd<DataType, Abi>; - constexpr std::size_t width = simd_type::size(); - constexpr std::size_t num_cases = 8; - - DataType max = std::numeric_limits<DataType>::max(); - - DataType shift_by[num_cases] = { - 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - DataType test_vals[width]; - for (std::size_t i = 0; i < width; ++i) { - DataType inc = max / width; - test_vals[i] = i * inc + 1; - } - - host_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by, - num_cases); - host_check_shift_op_all_loaders<Abi>(shift_left(), test_vals, shift_by, - num_cases); + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + if constexpr (std::is_integral_v<DataType>) { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + constexpr std::size_t width = simd_type::size(); + constexpr std::size_t num_cases = 16; + constexpr size_t alignment = + Kokkos::Experimental::simd<DataType, Abi>::size() * sizeof(DataType); + + DataType max = std::numeric_limits<DataType>::max(); + + alignas(alignment) DataType shift_by[num_cases] = { + 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1, + 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; + alignas(alignment) DataType test_vals[width]; + for (std::size_t i = 0; i 
< width; ++i) { + DataType inc = max / width; + test_vals[i] = i * inc + 1; + } - if constexpr (std::is_signed_v<DataType>) { - for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1; host_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by, num_cases); host_check_shift_op_all_loaders<Abi>(shift_left(), test_vals, shift_by, num_cases); + + if constexpr (std::is_signed_v<DataType>) { + for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1; + host_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by, + num_cases); + } } } } @@ -201,10 +207,11 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_by, n); device_check_shift_on_one_loader<Abi, load_as_scalars>(shift_op, test_vals, shift_by, n); + device_check_shift_on_one_loader<Abi, load_vector_aligned>( + shift_op, test_vals, shift_by, n); Kokkos::Experimental::simd<DataType, Abi> shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); device_check_shift_by_lanes_on_one_loader<Abi, load_element_aligned>( shift_op, test_vals, shift_by_lanes); @@ -212,37 +219,40 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_op, test_vals, shift_by_lanes); device_check_shift_by_lanes_on_one_loader<Abi, load_as_scalars>( shift_op, test_vals, shift_by_lanes); + device_check_shift_by_lanes_on_one_loader<Abi, load_vector_aligned>( + shift_op, test_vals, shift_by_lanes); } template <typename Abi, typename DataType> KOKKOS_INLINE_FUNCTION void device_check_shift_ops() { - if constexpr (std::is_integral_v<DataType>) { - using simd_type = Kokkos::Experimental::simd<DataType, Abi>; - constexpr std::size_t width = simd_type::size(); - constexpr std::size_t num_cases = 8; - - DataType max = Kokkos::reduction_identity<DataType>::max(); + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + if constexpr 
(std::is_integral_v<DataType>) { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + constexpr std::size_t width = simd_type::size(); + constexpr std::size_t num_cases = 16; - DataType shift_by[num_cases] = { - 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - DataType test_vals[width]; + DataType max = Kokkos::reduction_identity<DataType>::max(); - for (std::size_t i = 0; i < width; ++i) { - DataType inc = max / width; - test_vals[i] = i * inc + 1; - } + DataType shift_by[num_cases] = { + 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1, + 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; + DataType test_vals[width]; - device_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by, - num_cases); - device_check_shift_op_all_loaders<Abi>(shift_left(), test_vals, shift_by, - num_cases); + for (std::size_t i = 0; i < width; ++i) { + DataType inc = max / width; + test_vals[i] = i * inc + 1; + } - if constexpr (std::is_signed_v<DataType>) { - for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1; device_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, shift_by, num_cases); device_check_shift_op_all_loaders<Abi>(shift_left(), test_vals, shift_by, num_cases); + + if constexpr (std::is_signed_v<DataType>) { + for (std::size_t i = 0; i < width; ++i) test_vals[i] *= -1; + device_check_shift_op_all_loaders<Abi>(shift_right(), test_vals, + shift_by, num_cases); + } } } } diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp index 129f2b0d5c90c89e8b8c485f23acf8dc7385ee82..9179b049e34539471060c9a37cb6cc951014c082 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp @@ -22,60 +22,66 @@ template <typename Abi, typename DataType> inline void host_check_where_expr_scatter_to() { - using simd_type 
= Kokkos::Experimental::simd<DataType, Abi>; - using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>; - using mask_type = typename simd_type::mask_type; - - std::size_t nlanes = simd_type::size(); - DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; - simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); - - for (std::size_t idx = 0; idx < nlanes; ++idx) { - mask_type mask(true); - mask[idx] = false; - - DataType dst[8] = {0}; - index_type index; - simd_type expected_result; - for (std::size_t i = 0; i < nlanes; ++i) { - dst[i] = (2 + (i * 2)); - index[i] = i; - expected_result[i] = (mask[i]) ? src[index[i]] : dst[i]; + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>; + using mask_type = typename simd_type::mask_type; + + std::size_t nlanes = simd_type::size(); + DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37, + 53, 71, 79, 83, 89, 93, 97, 103}; + simd_type src; + src.copy_from(init, Kokkos::Experimental::simd_flag_default); + + for (std::size_t idx = 0; idx < nlanes; ++idx) { + mask_type mask(true); + mask[idx] = false; + + DataType dst[simd_type::size()] = {0}; + index_type index; + simd_type expected_result; + for (std::size_t i = 0; i < nlanes; ++i) { + dst[i] = (2 + (i * 2)); + index[i] = i; + expected_result[i] = (mask[i]) ? 
src[index[i]] : dst[i]; + } + where(mask, src).scatter_to(dst, index); + + simd_type dst_simd; + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); + + host_check_equality(expected_result, dst_simd, nlanes); } - where(mask, src).scatter_to(dst, index); - - simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); - - host_check_equality(expected_result, dst_simd, nlanes); } } template <typename Abi, typename DataType> inline void host_check_where_expr_gather_from() { - using simd_type = Kokkos::Experimental::simd<DataType, Abi>; - using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>; - using mask_type = typename simd_type::mask_type; - - std::size_t nlanes = simd_type::size(); - DataType src[] = {11, 13, 17, 19, 23, 29, 31, 37}; - - for (std::size_t idx = 0; idx < nlanes; ++idx) { - mask_type mask(true); - mask[idx] = false; - - simd_type dst; - index_type index; - simd_type expected_result; - for (std::size_t i = 0; i < nlanes; ++i) { - dst[i] = (2 + (i * 2)); - index[i] = i; - expected_result[i] = (mask[i]) ? src[index[i]] : dst[i]; + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>; + using mask_type = typename simd_type::mask_type; + + std::size_t nlanes = simd_type::size(); + DataType src[] = {11, 13, 17, 19, 23, 29, 31, 37, + 53, 71, 79, 83, 89, 93, 97, 103}; + + for (std::size_t idx = 0; idx < nlanes; ++idx) { + mask_type mask(true); + mask[idx] = false; + + simd_type dst; + index_type index; + simd_type expected_result; + for (std::size_t i = 0; i < nlanes; ++i) { + dst[i] = (2 + (i * 2)); + index[i] = i; + expected_result[i] = (mask[i]) ? 
src[index[i]] : dst[i]; + } + where(mask, dst).gather_from(src, index); + + host_check_equality(expected_result, dst, nlanes); } - where(mask, dst).gather_from(src, index); - - host_check_equality(expected_result, dst, nlanes); } } @@ -100,33 +106,36 @@ inline void host_check_where_expr_all_abis( template <typename Abi, typename DataType> KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { - using simd_type = Kokkos::Experimental::simd<DataType, Abi>; - using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>; - using mask_type = typename simd_type::mask_type; - - std::size_t nlanes = simd_type::size(); - DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; - simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); - - for (std::size_t idx = 0; idx < nlanes; ++idx) { - mask_type mask(true); - mask[idx] = false; - - DataType dst[8] = {0}; - index_type index; - simd_type expected_result; - for (std::size_t i = 0; i < nlanes; ++i) { - dst[i] = (2 + (i * 2)); - index[i] = i; - expected_result[i] = (mask[i]) ? src[index[i]] : dst[i]; + if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) { + using simd_type = Kokkos::Experimental::simd<DataType, Abi>; + using index_type = Kokkos::Experimental::simd<std::int32_t, Abi>; + using mask_type = typename simd_type::mask_type; + + std::size_t nlanes = simd_type::size(); + DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37, + 53, 71, 79, 83, 89, 93, 97, 103}; + simd_type src; + src.copy_from(init, Kokkos::Experimental::simd_flag_default); + + for (std::size_t idx = 0; idx < nlanes; ++idx) { + mask_type mask(true); + mask[idx] = false; + + DataType dst[simd_type::size()] = {0}; + index_type index; + simd_type expected_result; + for (std::size_t i = 0; i < nlanes; ++i) { + dst[i] = (2 + (i * 2)); + index[i] = i; + expected_result[i] = (mask[i]) ? 
src[index[i]] : dst[i]; + } + where(mask, src).scatter_to(dst, index); + + simd_type dst_simd; + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); + + device_check_equality(expected_result, dst_simd, nlanes); } - where(mask, src).scatter_to(dst, index); - - simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); - - device_check_equality(expected_result, dst_simd, nlanes); } } @@ -137,7 +146,8 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_gather_from() { using mask_type = typename simd_type::mask_type; std::size_t nlanes = simd_type::size(); - DataType src[] = {11, 13, 17, 19, 23, 29, 31, 37}; + DataType src[] = {11, 13, 17, 19, 23, 29, 31, 37, + 53, 71, 79, 83, 89, 93, 97, 103}; for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); diff --git a/packages/kokkos/tpls/.clang-format b/packages/kokkos/tpls/.clang-format index 743216e523eae0566aea6018925309936bf787d3..9d159247d518108410702980b90b13c2cfb4b84f 100644 --- a/packages/kokkos/tpls/.clang-format +++ b/packages/kokkos/tpls/.clang-format @@ -1,3 +1,2 @@ -#Official Tool: clang-format version 8.0.0 DisableFormat: true SortIncludes: false diff --git a/packages/kokkos/tpls/desul/Config.hpp.cmake.in b/packages/kokkos/tpls/desul/Config.hpp.cmake.in index a7bc738191e781e4f73f5cb86c727d2112b228f0..aed7ecfabc969808f36dea00caa20ef4b37b9b8a 100644 --- a/packages/kokkos/tpls/desul/Config.hpp.cmake.in +++ b/packages/kokkos/tpls/desul/Config.hpp.cmake.in @@ -14,6 +14,8 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_HIP #cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_SYCL +#cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP +#cmakedefine DESUL_ATOMICS_ENABLE_OPENACC #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_HIP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_HIP.hpp new file mode 
100644 index 0000000000000000000000000000000000000000..0eab27fe989fa792487c277630e41abcb1828555 --- /dev/null +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_HIP.hpp @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_ADAPT_HIP_HPP_ +#define DESUL_ATOMICS_ADAPT_HIP_HPP_ + +#include <desul/atomics/Common.hpp> + +namespace desul { +namespace Impl { + +// FIXME same code as GCCMemoryOrder +template <class MemoryOrder> +struct HIPMemoryOrder; + +template <> +struct HIPMemoryOrder<MemoryOrderRelaxed> { + static constexpr int value = __ATOMIC_RELAXED; +}; + +template <> +struct HIPMemoryOrder<MemoryOrderAcquire> { + static constexpr int value = __ATOMIC_ACQUIRE; +}; + +template <> +struct HIPMemoryOrder<MemoryOrderRelease> { + static constexpr int value = __ATOMIC_RELEASE; +}; + +template <> +struct HIPMemoryOrder<MemoryOrderAcqRel> { + static constexpr int value = __ATOMIC_ACQ_REL; +}; + +template <> +struct HIPMemoryOrder<MemoryOrderSeqCst> { + static constexpr int value = __ATOMIC_SEQ_CST; +}; + +// __HIP_MEMORY_SCOPE_SYSTEM +// __HIP_MEMORY_SCOPE_AGENT +// __HIP_MEMORY_SCOPE_WORKGROUP +// __HIP_MEMORY_SCOPE_WAVEFRONT +// __HIP_MEMORY_SCOPE_SINGLETHREAD +template <class MemoryScope> +struct HIPMemoryScope; + +template <> +struct HIPMemoryScope<MemoryScopeCore> { + static constexpr int value = __HIP_MEMORY_SCOPE_WORKGROUP; +}; + +template <> +struct HIPMemoryScope<MemoryScopeDevice> { + static constexpr int value = __HIP_MEMORY_SCOPE_AGENT; +}; + +template <> +struct HIPMemoryScope<MemoryScopeNode> { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +template <> +struct HIPMemoryScope<MemoryScopeSystem> { + static constexpr int value = __HIP_MEMORY_SCOPE_SYSTEM; +}; + +} // namespace Impl +} // namespace desul + +#endif diff --git 
a/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp index 082fc132de53caecd5735a3b9d617edee5feb463..15c6d78d94bf0635b4e39eb1848d660ecc1fd0c9 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp @@ -88,15 +88,18 @@ using sycl_atomic_ref = sycl::atomic_ref<T, sycl::access::address_space::generic_space>; #endif -// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead #ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED -// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible. +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL +template <class T> +using sycl_device_global = sycl::ext::oneapi::experimental::device_global<T>; +#else template <class T> using sycl_device_global = sycl::ext::oneapi::experimental::device_global< T, decltype(sycl::ext::oneapi::experimental::properties( sycl::ext::oneapi::experimental::device_image_scope))>; #endif +#endif } // namespace Impl } // namespace desul diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp index 3d69dcf6c50ce0ab034f125ac979bd37a4ac1a56..149876a49f1f058cb81d061a8ea7c6b6572f743f 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Atomic_Ref.hpp @@ -6,533 +6,111 @@ Source: https://github.com/desul/desul SPDX-License-Identifier: (BSD-3-Clause) */ -#ifndef DESUL_ATOMIC_REF_IMPL_HPP_ -#define DESUL_ATOMIC_REF_IMPL_HPP_ +#ifndef DESUL_ATOMIC_REF_HPP_ +#define DESUL_ATOMIC_REF_HPP_ -#include <cstddef> #include <desul/atomics/Common.hpp> #include <desul/atomics/Generic.hpp> #include <desul/atomics/Macros.hpp> -#include <memory> -#include <type_traits> namespace desul { -namespace Impl { -// TODO current implementation is missing the following: -// * member functions 
-// * wait -// * notify_one -// * notify_all - -template <typename T, - typename MemoryOrder, - typename MemoryScope, - bool = std::is_integral<T>{}, - bool = std::is_floating_point<T>{}> -struct basic_atomic_ref; - -// base class for non-integral, non-floating-point, non-pointer types -template <typename T, typename MemoryOrder, typename MemoryScope> -struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, false> { - static_assert(std::is_trivially_copyable<T>{}, ""); - - private: - T* _ptr; - - // 1/2/4/8/16-byte types must be aligned to at least their size - static constexpr int _min_alignment = (sizeof(T) & (sizeof(T) - 1)) || sizeof(T) > 16 - ? 0 - : sizeof(T); - - public: - using value_type = T; - - static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T)); - - static constexpr std::size_t required_alignment = _min_alignment > alignof(T) - ? _min_alignment - : alignof(T); - - basic_atomic_ref() = delete; - basic_atomic_ref& operator=(basic_atomic_ref const&) = delete; - - basic_atomic_ref(basic_atomic_ref const&) = default; - - explicit basic_atomic_ref(T& obj) : _ptr(std::addressof(obj)) {} - - T operator=(T desired) const noexcept { - this->store(desired); - return desired; - } - - operator T() const noexcept { return this->load(); } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION void store(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - atomic_store(_ptr, desired, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION T exchange(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, desired, order, MemoryScope()); - } - - DESUL_FUNCTION bool is_lock_free() const noexcept { - return atomic_is_lock_free<sizeof(T), 
required_alignment>(); - } - - template <typename SuccessMemoryOrder, typename FailureMemoryOrder> - DESUL_FUNCTION bool compare_exchange_weak(T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_weak( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION bool compare_exchange_weak( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_weak(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template <typename SuccessMemoryOrder, typename FailureMemoryOrder> - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_strong( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_strong(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } -}; - -// base class for atomic_ref<integral-type> -template <typename T, typename MemoryOrder, typename MemoryScope> -struct basic_atomic_ref<T, MemoryOrder, MemoryScope, true, false> { - static_assert(std::is_integral<T>{}, ""); - - private: - T* _ptr; - - public: - using value_type = T; - using difference_type = value_type; - - static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T)); - - static constexpr std::size_t required_alignment = sizeof(T) > alignof(T) ? 
sizeof(T) - : alignof(T); - - basic_atomic_ref() = delete; - basic_atomic_ref& operator=(basic_atomic_ref const&) = delete; - - explicit basic_atomic_ref(T& obj) : _ptr(&obj) {} - - basic_atomic_ref(basic_atomic_ref const&) = default; - - T operator=(T desired) const noexcept { - this->store(desired); - return desired; - } - - operator T() const noexcept { return this->load(); } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION void store(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - atomic_store(_ptr, desired, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION T exchange(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, desired, order, MemoryScope()); - } - - DESUL_FUNCTION bool is_lock_free() const noexcept { - return atomic_is_lock_free<sizeof(T), required_alignment>(); - } - - template <typename SuccessMemoryOrder, typename FailureMemoryOrder> - DESUL_FUNCTION bool compare_exchange_weak(T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_weak( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION bool compare_exchange_weak( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_weak(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template <typename SuccessMemoryOrder, typename FailureMemoryOrder> - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_strong( - 
_ptr, expected, desired, success, failure, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_strong(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_add(_ptr, arg, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_sub(_ptr, arg, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_and(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_and(_ptr, arg, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_or(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_or(_ptr, arg, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_xor(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_xor(_ptr, arg, order, MemoryScope()); - } - - DESUL_FUNCTION value_type operator++() const noexcept { - return atomic_add_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); } - - DESUL_FUNCTION value_type operator--() const noexcept { - return atomic_sub_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); } - - DESUL_FUNCTION value_type 
operator+=(value_type arg) const noexcept { - atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept { - atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator&=(value_type arg) const noexcept { - atomic_and_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator|=(value_type arg) const noexcept { - atomic_or_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator^=(value_type arg) const noexcept { - atomic_xor_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } -}; - -// base class for atomic_ref<floating-point-type> template <typename T, typename MemoryOrder, typename MemoryScope> -struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, true> { - static_assert(std::is_floating_point<T>{}, ""); - - private: - T* _ptr; +class AtomicRef { + T* ptr_; public: using value_type = T; - using difference_type = value_type; - - static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T)); - - static constexpr std::size_t required_alignment = alignof(T); - - basic_atomic_ref() = delete; - basic_atomic_ref& operator=(basic_atomic_ref const&) = delete; - - explicit basic_atomic_ref(T& obj) : _ptr(&obj) {} - - basic_atomic_ref(basic_atomic_ref const&) = default; - - T operator=(T desired) const noexcept { - this->store(desired); - return desired; - } - - operator T() const noexcept { return this->load(); } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION void store(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - atomic_store(_ptr, desired, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION T 
exchange(T desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, desired, order, MemoryScope()); - } - - DESUL_FUNCTION bool is_lock_free() const noexcept { - return atomic_is_lock_free<sizeof(T), required_alignment>(); - } - - template <typename SuccessMemoryOrder, typename FailureMemoryOrder> - DESUL_FUNCTION bool compare_exchange_weak(T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_weak( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION bool compare_exchange_weak( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_weak(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template <typename SuccessMemoryOrder, typename FailureMemoryOrder> - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, - T desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_strong( - _ptr, expected, desired, success, failure, MemoryScope()); - } + using memory_order = MemoryOrder; + using memory_scope = MemoryScope; - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION bool compare_exchange_strong( - T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_strong(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_add(_ptr, arg, order, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept { - return 
atomic_fetch_sub(_ptr, arg, order, MemoryScope()); - } - - DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept { - atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } - - DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept { - atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope()); - } -}; + DESUL_FUNCTION explicit AtomicRef(T& obj) : ptr_(&obj) {} -// base class for atomic_ref<pointer-type> -template <typename T, typename MemoryOrder, typename MemoryScope> -struct basic_atomic_ref<T*, MemoryOrder, MemoryScope, false, false> { - private: - T** _ptr; - - public: - using value_type = T*; - using difference_type = std::ptrdiff_t; - - static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T)); - - static constexpr std::size_t required_alignment = alignof(T*); - - basic_atomic_ref() = delete; - basic_atomic_ref& operator=(basic_atomic_ref const&) = delete; - - explicit basic_atomic_ref(T*& arg) : _ptr(std::addressof(arg)) {} - - basic_atomic_ref(basic_atomic_ref const&) = default; - - T* operator=(T* desired) const noexcept { - this->store(desired); + DESUL_FUNCTION T operator=(T desired) const noexcept { + store(desired); return desired; } - operator T*() const noexcept { return this->load(); } + DESUL_FUNCTION operator T() const noexcept { return load(); } - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION void store(T* desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - atomic_store(_ptr, desired, order, MemoryScope()); + DESUL_FUNCTION T load() const noexcept { + return desul::atomic_load(ptr_, MemoryOrder(), MemoryScope()); } - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION T* load(_MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, order, MemoryScope()); + DESUL_FUNCTION void store(T desired) const noexcept { + return desul::atomic_store(ptr_, desired, MemoryOrder(), MemoryScope()); } - template <typename _MemoryOrder = 
MemoryOrder> - DESUL_FUNCTION T* exchange(T* desired, - _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_load(_ptr, desired, order, MemoryScope()); + DESUL_FUNCTION T exchange(T desired) const noexcept { + return desul::atomic_exchange(ptr_, desired, MemoryOrder(), MemoryScope()); } - DESUL_FUNCTION bool is_lock_free() const noexcept { - return atomic_is_lock_free<sizeof(T*), required_alignment>(); - } + // TODO compare_exchange_{weak,strong} and is_lock_free - template <typename SuccessMemoryOrder, typename FailureMemoryOrder> - DESUL_FUNCTION bool compare_exchange_weak(T*& expected, - T* desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_weak( - _ptr, expected, desired, success, failure, MemoryScope()); +#define DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(FETCH_OP, OP_FETCH) \ + DESUL_FUNCTION T FETCH_OP(T arg) const noexcept { \ + return desul::atomic_##FETCH_OP(ptr_, arg, MemoryOrder(), MemoryScope()); \ + } \ + DESUL_FUNCTION T OP_FETCH(T arg) const noexcept { \ + return desul::atomic_##OP_FETCH(ptr_, arg, MemoryOrder(), MemoryScope()); \ } - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION bool compare_exchange_weak( - T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return compare_exchange_weak(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } - - template <typename SuccessMemoryOrder, typename FailureMemoryOrder> - DESUL_FUNCTION bool compare_exchange_strong( - T*& expected, - T* desired, - SuccessMemoryOrder success, - FailureMemoryOrder failure) const noexcept { - return atomic_compare_exchange_strong( - _ptr, expected, desired, success, failure, MemoryScope()); - } - - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION bool compare_exchange_strong( - T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept { - return 
compare_exchange_strong(expected, - desired, - order, - cmpexch_failure_memory_order<_MemoryOrder>(), - MemoryScope()); - } +#define DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(COMPD_ASGMT, OP_FETCH) \ + DESUL_FUNCTION T operator COMPD_ASGMT(T arg) const noexcept { return OP_FETCH(arg); } - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_add(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_add(_ptr, _type_size(d), order, MemoryScope()); - } + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_add, add_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(+=, add_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_sub, sub_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(-=, sub_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_min, min_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_max, max_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_mul, mul_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(*=, mul_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_div, div_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(/=, div_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_mod, mod_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(%=, mod_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_and, and_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(&=, and_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_or, or_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(|=, or_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_xor, xor_fetch) + DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP(^=, xor_fetch) + DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP(fetch_nand, nand_fetch) - template <typename _MemoryOrder = MemoryOrder> - DESUL_FUNCTION value_type - fetch_sub(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept { - return atomic_fetch_sub(_ptr, _type_size(d), order, MemoryScope()); - } +#undef DESUL_IMPL_DEFINE_ATOMIC_COMPOUND_ASSIGNMENT_OP +#undef 
DESUL_IMPL_DEFINE_ATOMIC_FETCH_OP - DESUL_FUNCTION value_type operator++() const noexcept { - return atomic_add_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope()); - } +#define DESUL_IMPL_DEFINE_ATOMIC_INCREMENT_DECREMENT(OPER, NAME) \ + DESUL_FUNCTION T fetch_##NAME() const noexcept { \ + return desul::atomic_fetch_##NAME(ptr_, MemoryOrder(), MemoryScope()); \ + } \ + DESUL_FUNCTION T NAME##_fetch() const noexcept { \ + return desul::atomic_##NAME##_fetch(ptr_, MemoryOrder(), MemoryScope()); \ + } \ + DESUL_FUNCTION T operator OPER() const noexcept { return NAME##_fetch(); } \ + DESUL_FUNCTION T operator OPER(int) const noexcept { return fetch_##NAME(); } - DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); } - - DESUL_FUNCTION value_type operator--() const noexcept { - return atomic_sub_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope()); - } + DESUL_IMPL_DEFINE_ATOMIC_INCREMENT_DECREMENT(++, inc) + DESUL_IMPL_DEFINE_ATOMIC_INCREMENT_DECREMENT(--, dec) - DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); } +#undef DESUL_IMPL_DEFINE_ATOMIC_INCREMENT_DECREMENT - DESUL_FUNCTION value_type operator+=(difference_type d) const noexcept { - atomic_add_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope()); +#define DESUL_IMPL_DEFINE_ATOMIC_BITWISE_SHIFT(COMPD_ASGMT, SHFT) \ + DESUL_FUNCTION T fetch_##SHFT(unsigned int arg) const noexcept { \ + return desul::atomic_fetch_##SHFT(ptr_, arg, MemoryOrder(), MemoryScope()); \ + } \ + DESUL_FUNCTION T SHFT##_fetch(unsigned int arg) const noexcept { \ + return desul::atomic_##SHFT##_fetch(ptr_, arg, MemoryOrder(), MemoryScope()); \ + } \ + DESUL_FUNCTION T operator COMPD_ASGMT(unsigned int arg) const noexcept { \ + return SHFT##_fetch(arg); \ } - DESUL_FUNCTION value_type operator-=(difference_type d) const noexcept { - atomic_sub_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope()); - } - - private: - static constexpr std::ptrdiff_t 
_type_size(std::ptrdiff_t d) noexcept { - static_assert(std::is_object<T>{}, ""); - return d * sizeof(T); - } -}; - -} // namespace Impl - -template <typename T, typename MemoryOrder, typename MemoryScope> -struct scoped_atomic_ref : Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope> { - explicit scoped_atomic_ref(T& obj) noexcept - : Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope>(obj) {} - - scoped_atomic_ref& operator=(scoped_atomic_ref const&) = delete; - - scoped_atomic_ref(scoped_atomic_ref const&) = default; + DESUL_IMPL_DEFINE_ATOMIC_BITWISE_SHIFT(<<=, lshift) + DESUL_IMPL_DEFINE_ATOMIC_BITWISE_SHIFT(>>=, rshift) - using Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope>::operator=; +#undef DESUL_IMPL_DEFINE_ATOMIC_BITWISE_SHIFT }; } // namespace desul diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp index e91569e1dee8e6073f06e018aa5d75b0e13075d3..72639fc49322f6b7b19a3042d0384a85c0179642 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include <desul/atomics/Compare_Exchange_OpenMP.hpp> #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include <desul/atomics/Compare_Exchange_OpenACC.hpp> +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include <desul/atomics/Compare_Exchange_SYCL.hpp> #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp index 8c909bacdf41d10bc3d342ef740f339c107e132e..0ade34f25dfea3e367fbe02e75bf73b51fcd3905 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_HIP.hpp @@ -9,6 +9,7 @@ SPDX-License-Identifier: (BSD-3-Clause) 
#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ #define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_ +#include <desul/atomics/Adapt_HIP.hpp> #include <desul/atomics/Common.hpp> #include <desul/atomics/Lock_Array_HIP.hpp> #include <desul/atomics/Thread_Fence_HIP.hpp> @@ -17,130 +18,40 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { namespace Impl { -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4, T> device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest), - reinterpret_cast<unsigned int&>(compare), - reinterpret_cast<unsigned int&>(value)); - return reinterpret_cast<T&>(return_val); -} -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 8, T> device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 64-bit"); - unsigned long long int return_val = - atomicCAS(reinterpret_cast<unsigned long long int*>(dest), - reinterpret_cast<unsigned long long int&>(compare), - reinterpret_cast<unsigned long long int&>(value)); - return reinterpret_cast<T&>(return_val); -} +template <class T> +struct atomic_exchange_available_hip { + constexpr static bool value = + ((sizeof(T) == 1 && alignof(T) == 1) || (sizeof(T) == 4 && alignof(T) == 4) || + (sizeof(T) == 8 && alignof(T) == 8)) && + std::is_trivially_copyable<T>::value; +}; -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - 
atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; -} - -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) { - atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - return return_val; -} - -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> +template <class T, class MemoryOrder, class MemoryScope> +__device__ std::enable_if_t<atomic_exchange_available_hip<T>::value, T> device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) { - atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return return_val; + T* const dest, T compare, T value, MemoryOrder, MemoryScope) { + (void)__hip_atomic_compare_exchange_strong( + dest, + &compare, + value, + HIPMemoryOrder<MemoryOrder>::value, + HIPMemoryOrder<cmpexch_failure_memory_order<MemoryOrder>>::value, + HIPMemoryScope<MemoryScope>::value); + return compare; } -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4, T> device_atomic_exchange( - T* const dest, T value, MemoryOrderRelaxed, MemoryScope) { - static_assert(sizeof(unsigned int) == 4, - "this function assumes an unsigned int is 32-bit"); - unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest), - reinterpret_cast<unsigned int&>(value)); - return reinterpret_cast<T&>(return_val); -} -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 8, T> device_atomic_exchange( - T* const dest, T value, MemoryOrderRelaxed, MemoryScope) { - 
static_assert(sizeof(unsigned long long int) == 8, - "this function assumes an unsigned long long is 64-bit"); - unsigned long long int return_val = - atomicExch(reinterpret_cast<unsigned long long int*>(dest), - reinterpret_cast<unsigned long long int&>(value)); - return reinterpret_cast<T&>(return_val); -} - -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> device_atomic_exchange( - T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) { - T return_val = device_atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast<T&>(return_val); -} - -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> device_atomic_exchange( - T* const dest, T /*compare*/, T value, MemoryOrderAcquire, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - return reinterpret_cast<T&>(return_val); -} - -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> device_atomic_exchange( - T* const dest, T value, MemoryOrderAcqRel, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast<T&>(return_val); -} - -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> device_atomic_exchange( - T* const dest, T value, MemoryOrderSeqCst, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = - device_atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope()); - 
device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); - return reinterpret_cast<T&>(return_val); -} - -template <class T, class MemoryScope> -__device__ std::enable_if_t<sizeof(T) == 4 || sizeof(T) == 8, T> -device_atomic_compare_exchange( - T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) { - device_atomic_thread_fence(MemoryOrderAcquire(), MemoryScope()); - T return_val = device_atomic_compare_exchange( - dest, compare, value, MemoryOrderRelaxed(), MemoryScope()); - device_atomic_thread_fence(MemoryOrderRelease(), MemoryScope()); +template <class T, class MemoryOrder, class MemoryScope> +__device__ std::enable_if_t<atomic_exchange_available_hip<T>::value, T> +device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope) { + T return_val = __hip_atomic_exchange(dest, + value, + HIPMemoryOrder<MemoryOrder>::value, + HIPMemoryScope<MemoryScope>::value); return return_val; } template <class T, class MemoryOrder, class MemoryScope> -__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> +__device__ std::enable_if_t<!atomic_exchange_available_hip<T>::value, T> device_atomic_compare_exchange( T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) { // This is a way to avoid deadlock in a warp or wave front @@ -169,7 +80,7 @@ device_atomic_compare_exchange( } template <class T, class MemoryOrder, class MemoryScope> -__device__ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> +__device__ std::enable_if_t<!atomic_exchange_available_hip<T>::value, T> device_atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) { // This is a way to avoid deadlock in a warp or wave front T return_val; diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp new file mode 100644 index 0000000000000000000000000000000000000000..77149bd47419a22127063044c6078bf5e2393f00 --- /dev/null +++ 
b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp @@ -0,0 +1,153 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ +#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ + +#include <openacc.h> + +#include <desul/atomics/Common.hpp> +#include <desul/atomics/Thread_Fence_OpenACC.hpp> +#include <type_traits> + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +#pragma acc routine seq +template <class T, class MemoryOrder, class MemoryScope> +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope /*scope*/) { + if constexpr (std::is_arithmetic_v<T> && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template <class T, class MemoryOrder, class MemoryScope> +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // Floating point types treated separetely to work around compiler errors + // "parse invalid cast opcode for cast from 'i32' to 'float'". 
+ // Also not just "forwarding" arguments to atomicCAS because it does not have an + // overload that takes int64_t + if constexpr (std::is_integral_v<T> && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + static_assert(sizeof(unsigned int) == 4); + static_assert(sizeof(unsigned long long int) == 8); + using cas_t = + std::conditional_t<(sizeof(T) == 4), unsigned int, unsigned long long int>; + cas_t return_val = atomicCAS(reinterpret_cast<cas_t*>(dest), + reinterpret_cast<cas_t&>(compare), + reinterpret_cast<cas_t&>(value)); + return reinterpret_cast<T&>(return_val); +#ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL + } else if constexpr (std::is_same_v<T, float>) { +#else + } else if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) { +#endif + return atomicCAS(dest, compare, value); + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; + } +} + +#else // not NVHPC + +#pragma acc routine seq +template <class T, class MemoryOrder, class MemoryScope> +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope) { + if constexpr (std::is_arithmetic_v<T>) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // 
device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template <class T, class MemoryOrder, class MemoryScope> +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic operation " + "in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; +} + +#endif + +} // namespace Impl +} // namespace desul + +#endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp index 43b4fb56f9cafbd0722684ad04a9909dbf1fd618..4c136fb364a75e9993c8799371c5acf97decc20b 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp @@ -83,7 +83,11 @@ device_atomic_compare_exchange( // This is a way to avoid deadlock in a subgroup T return_val; int done = 0; +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto sg = sycl::ext::oneapi::this_work_item::get_sub_group(); +#else auto sg = sycl::ext::oneapi::experimental::this_sub_group(); +#endif using sycl::ext::oneapi::group_ballot; using sycl::ext::oneapi::sub_group_mask; sub_group_mask active = group_ballot(sg, 1); @@ -114,7 +118,11 @@ std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> device_atomic_exchange 
// This is a way to avoid deadlock in a subgroup T return_val; int done = 0; +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto sg = sycl::ext::oneapi::this_work_item::get_sub_group(); +#else auto sg = sycl::ext::oneapi::experimental::this_sub_group(); +#endif using sycl::ext::oneapi::group_ballot; using sycl::ext::oneapi::sub_group_mask; sub_group_mask active = group_ballot(sg, 1); diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp index adf75c574371d9946f4d3d9c345f38ee549397c5..1b161397c74bae6522bb3d99f2892f75f86190c8 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp @@ -23,6 +23,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include <desul/atomics/Fetch_Op_OpenMP.hpp> #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include <desul/atomics/Fetch_Op_OpenACC.hpp> +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include <desul/atomics/Fetch_Op_SYCL.hpp> #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp index 69ed8bcb9fd8a46fd609e6eb42de0dc4a8fbac44..68622758d8e02fb6f583388fb5e8e1d8f7978f09 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_CUDA.hpp @@ -69,56 +69,56 @@ inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); } // clang-format on -#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, TYPE) \ +#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, TYPE) \ template <class MemoryOrder> \ - __device__ TYPE device_atomic_fetch_##OP( \ + __device__ TYPE 
device_atomic_##FETCH_OP( \ TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeDevice) { \ __threadfence(); \ TYPE return_val = \ - device_atomic_fetch_##OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \ + device_atomic_##FETCH_OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \ __threadfence(); \ return return_val; \ } \ template <class MemoryOrder> \ - __device__ TYPE device_atomic_fetch_##OP( \ + __device__ TYPE device_atomic_##FETCH_OP( \ TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeCore) { \ - return device_atomic_fetch_##OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ + return device_atomic_##FETCH_OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ } -#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(OP) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, int) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, unsigned int) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, unsigned long long) +#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(FETCH_OP) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, int) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, unsigned int) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, unsigned long long) #ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL -#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, float) +#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(FETCH_OP) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, float) #else -#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, float) \ - DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(OP, double) +#define DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(FETCH_OP) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, float) \ + DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(FETCH_OP, double) #endif -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(min) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(max) 
-DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(and) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(or) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(xor) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_min) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_max) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_and) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_or) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_xor) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(add) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(add) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(sub) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(sub) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(fetch_add) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_add) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(fetch_sub) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_sub) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(inc) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(dec) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_inc) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(fetch_dec) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(inc_mod, unsigned int) -DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(fetch_inc_mod, unsigned int) +DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP(fetch_dec_mod, unsigned int) #undef DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT #undef DESUL_IMPL_CUDA_DEVICE_ATOMIC_FETCH_OP_INTEGRAL diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_Generic.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_Generic.hpp index a94ff8ef1875ed5ce96e7440e2c41292878bb63c..530195a832712a9ec2ff950c745052800d5e0045 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_Generic.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_Generic.hpp @@ -18,38 +18,38 @@ SPDX-License-Identifier: 
(BSD-3-Clause) namespace desul { namespace Impl { -#define DESUL_IMPL_ATOMIC_FETCH_OP(ANNOTATION, HOST_OR_DEVICE, OP) \ - template <class T, class MemoryOrder, class MemoryScope> \ - ANNOTATION T HOST_OR_DEVICE##_atomic_fetch_##OP( \ - T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ - return HOST_OR_DEVICE##_atomic_fetch_oper( \ - OP##_operator<T, const T>(), dest, val, order, scope); \ - } \ - template <class T, class MemoryOrder, class MemoryScope> \ - ANNOTATION T HOST_OR_DEVICE##_atomic_##OP##_fetch( \ - T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ - return HOST_OR_DEVICE##_atomic_oper_fetch( \ - OP##_operator<T, const T>(), dest, val, order, scope); \ +#define DESUL_IMPL_ATOMIC_FETCH_OP(ANNOTATION, HOST_OR_DEVICE, FETCH_OP, OP_FETCH) \ + template <class T, class MemoryOrder, class MemoryScope> \ + ANNOTATION T HOST_OR_DEVICE##_atomic_##FETCH_OP( \ + T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ + return HOST_OR_DEVICE##_atomic_fetch_oper( \ + OP_FETCH##_operator<T, const T>(), dest, val, order, scope); \ + } \ + template <class T, class MemoryOrder, class MemoryScope> \ + ANNOTATION T HOST_OR_DEVICE##_atomic_##OP_FETCH( \ + T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ + return HOST_OR_DEVICE##_atomic_oper_fetch( \ + OP_FETCH##_operator<T, const T>(), dest, val, order, scope); \ } -#define DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(OP) \ - DESUL_IMPL_ATOMIC_FETCH_OP(DESUL_IMPL_HOST_FUNCTION, host, OP) \ - DESUL_IMPL_ATOMIC_FETCH_OP(DESUL_IMPL_DEVICE_FUNCTION, device, OP) - -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(add) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(sub) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(max) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(min) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(mul) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(div) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(mod) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(and) 
-DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(or) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(xor) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(nand) - -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(inc_mod) -DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(dec_mod) +#define DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(FETCH_OP, OP_FETCH) \ + DESUL_IMPL_ATOMIC_FETCH_OP(DESUL_IMPL_HOST_FUNCTION, host, FETCH_OP, OP_FETCH) \ + DESUL_IMPL_ATOMIC_FETCH_OP(DESUL_IMPL_DEVICE_FUNCTION, device, FETCH_OP, OP_FETCH) + +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_add, add_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_sub, sub_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_max, max_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_min, min_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_mul, mul_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_div, div_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_mod, mod_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_and, and_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_or, or_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_xor, xor_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_nand, nand_fetch) + +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_inc_mod, inc_mod_fetch) +DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(fetch_dec_mod, dec_mod_fetch) #undef DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE #undef DESUL_IMPL_ATOMIC_FETCH_OP @@ -59,13 +59,13 @@ DESUL_IMPL_ATOMIC_FETCH_OP_HOST_AND_DEVICE(dec_mod) ANNOTATION T HOST_OR_DEVICE##_atomic_fetch_##OP( \ T* const dest, const unsigned int val, MemoryOrder order, MemoryScope scope) { \ return HOST_OR_DEVICE##_atomic_fetch_oper( \ - OP##_operator<T, const unsigned int>(), dest, val, order, scope); \ + OP##_fetch_operator<T, const unsigned int>(), dest, val, order, scope); \ } \ template <class T, class MemoryOrder, class MemoryScope> \ ANNOTATION T HOST_OR_DEVICE##_atomic_##OP##_fetch( \ T* const dest, const unsigned 
int val, MemoryOrder order, MemoryScope scope) { \ return HOST_OR_DEVICE##_atomic_oper_fetch( \ - OP##_operator<T, const unsigned int>(), dest, val, order, scope); \ + OP##_fetch_operator<T, const unsigned int>(), dest, val, order, scope); \ } #define DESUL_IMPL_ATOMIC_FETCH_OP_SHIFT_HOST_AND_DEVICE(OP) \ @@ -78,19 +78,21 @@ DESUL_IMPL_ATOMIC_FETCH_OP_SHIFT_HOST_AND_DEVICE(rshift) #undef DESUL_IMPL_ATOMIC_FETCH_OP_SHIFT_HOST_AND_DEVICE #undef DESUL_IMPL_ATOMIC_FETCH_OP_SHIFT -#define DESUL_IMPL_ATOMIC_LOAD_AND_STORE(ANNOTATION, HOST_OR_DEVICE) \ - template <class T, class MemoryOrder, class MemoryScope> \ - ANNOTATION T HOST_OR_DEVICE##_atomic_load( \ - const T* const dest, MemoryOrder order, MemoryScope scope) { \ - return HOST_OR_DEVICE##_atomic_fetch_oper( \ - load_operator<T, const T>(), const_cast<T*>(dest), T(), order, scope); \ - } \ - \ - template <class T, class MemoryOrder, class MemoryScope> \ - ANNOTATION void HOST_OR_DEVICE##_atomic_store( \ - T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ - (void)HOST_OR_DEVICE##_atomic_fetch_oper( \ - store_operator<T, const T>(), dest, val, order, scope); \ +// NOTE: using atomic_oper_fetch in the fallback implementation of atomic_store to avoid +// reading potentially uninitialized values which would yield undefined behavior. 
+#define DESUL_IMPL_ATOMIC_LOAD_AND_STORE(ANNOTATION, HOST_OR_DEVICE) \ + template <class T, class MemoryOrder, class MemoryScope> \ + ANNOTATION T HOST_OR_DEVICE##_atomic_load( \ + const T* const dest, MemoryOrder order, MemoryScope scope) { \ + return HOST_OR_DEVICE##_atomic_fetch_oper( \ + load_fetch_operator<T, const T>(), const_cast<T*>(dest), T(), order, scope); \ + } \ + \ + template <class T, class MemoryOrder, class MemoryScope> \ + ANNOTATION void HOST_OR_DEVICE##_atomic_store( \ + T* const dest, const T val, MemoryOrder order, MemoryScope scope) { \ + (void)HOST_OR_DEVICE##_atomic_oper_fetch( \ + store_fetch_operator<T, const T>(), dest, val, order, scope); \ } DESUL_IMPL_ATOMIC_LOAD_AND_STORE(DESUL_IMPL_HOST_FUNCTION, host) diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp index e9c749809de5910368a7694e354d50874e58fa9d..8d9bd868250649fdeae65b15174c1983a5c48600 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_HIP.hpp @@ -9,99 +9,108 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifndef DESUL_ATOMICS_FECH_OP_HIP_HPP_ #define DESUL_ATOMICS_FECH_OP_HIP_HPP_ +#include <desul/atomics/Adapt_HIP.hpp> + namespace desul { namespace Impl { -// clang-format off -inline __device__ int device_atomic_fetch_add( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_add( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_add(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } -inline __device__ float device_atomic_fetch_add( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } 
-inline __device__ double device_atomic_fetch_add( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, val); } - -inline __device__ int device_atomic_fetch_sub( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_sub( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_sub(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ float device_atomic_fetch_sub( float* ptr, float val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } -inline __device__ double device_atomic_fetch_sub( double* ptr, double val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -val); } - -inline __device__ int device_atomic_fetch_min( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_min( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_min(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMin(ptr, val); } - -inline __device__ int device_atomic_fetch_max( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_max( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_max(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicMax(ptr, val); } - -inline __device__ int device_atomic_fetch_and( int* ptr, int val, MemoryOrderRelaxed, 
MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_and( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_and(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAnd(ptr, val); } - -inline __device__ int device_atomic_fetch_or ( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned int device_atomic_fetch_or ( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_or (unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicOr (ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, T) \ + template <class MemoryOrder, class MemoryScope> \ + __device__ inline T device_atomic_fetch_##OP( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_##OP(ptr, \ + val, \ + HIPMemoryOrder<MemoryOrder>::value, \ + HIPMemoryScope<MemoryScope>::value); \ + } -inline __device__ int device_atomic_fetch_xor( int* ptr, int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_xor( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } -inline __device__ unsigned long long device_atomic_fetch_xor(unsigned long long* ptr, unsigned long long val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicXor(ptr, val); } +#define DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, long long) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned int) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, unsigned long long) + +#define 
DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, float) \ + DESUL_IMPL_HIP_ATOMIC_FETCH_OP(OP, double) + +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(add) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(min) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(max) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(and) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(or) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL(xor) +DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(add) +// atomic min/max gives the wrong results (tested with ROCm 6.0 on Frontier) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(min) +// DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT(max) + +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_FLOATING_POINT +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP_INTEGRAL +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_OP + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(T) \ + template <class MemoryOrder, class MemoryScope> \ + __device__ inline T device_atomic_fetch_sub( \ + T* ptr, T val, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -val, \ + HIPMemoryOrder<MemoryOrder>::value, \ + HIPMemoryScope<MemoryScope>::value); \ + } -inline __device__ int device_atomic_fetch_inc( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_inc( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_inc(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, 1ull); } +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(unsigned long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(float) +DESUL_IMPL_HIP_ATOMIC_FETCH_SUB(double) + +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_SUB + +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC(T) \ + template <class MemoryOrder, class MemoryScope> \ + __device__ inline T 
device_atomic_fetch_inc(T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + 1, \ + HIPMemoryOrder<MemoryOrder>::value, \ + HIPMemoryScope<MemoryScope>::value); \ + } \ + template <class MemoryOrder, class MemoryScope> \ + __device__ inline T device_atomic_fetch_dec(T* ptr, MemoryOrder, MemoryScope) { \ + return __hip_atomic_fetch_add(ptr, \ + -1, \ + HIPMemoryOrder<MemoryOrder>::value, \ + HIPMemoryScope<MemoryScope>::value); \ + } -inline __device__ int device_atomic_fetch_dec( int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1 ); } -inline __device__ unsigned int device_atomic_fetch_dec( unsigned int* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicSub(ptr, 1u ); } -inline __device__ unsigned long long device_atomic_fetch_dec(unsigned long long* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicAdd(ptr, -1 ); } +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(long long) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC(unsigned long long) -inline __device__ unsigned int device_atomic_fetch_inc_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicInc(ptr, val); } -inline __device__ unsigned int device_atomic_fetch_dec_mod( unsigned int* ptr, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { return atomicDec(ptr, val); } -// clang-format on +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, TYPE) \ +#define DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MEMORY_SCOPE, MEMORY_SCOPE_STRING_LITERAL) \ template <class MemoryOrder> \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeDevice) { \ - __threadfence(); \ - TYPE return_val = \ - device_atomic_fetch_##OP(ptr, val, MemoryOrderRelaxed(), MemoryScopeDevice()); \ - __threadfence(); \ - return return_val; \ + __device__ inline unsigned int device_atomic_fetch_inc_mod( \ + 
unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_inc32( \ + ptr, val, HIPMemoryOrder<MemoryOrder>::value, MEMORY_SCOPE_STRING_LITERAL); \ } \ template <class MemoryOrder> \ - __device__ TYPE device_atomic_fetch_##OP( \ - TYPE* ptr, TYPE val, MemoryOrder, MemoryScopeCore) { \ - return device_atomic_fetch_##OP(ptr, val, MemoryOrder(), MemoryScopeDevice()); \ + __device__ inline unsigned int device_atomic_fetch_dec_mod( \ + unsigned int* ptr, unsigned int val, MemoryOrder, MEMORY_SCOPE) { \ + return __builtin_amdgcn_atomic_dec32( \ + ptr, val, HIPMemoryOrder<MemoryOrder>::value, MEMORY_SCOPE_STRING_LITERAL); \ } -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned int) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, unsigned long long) - -#define DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(OP) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, float) \ - DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(OP, double) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(min) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(max) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(and) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(or) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(xor) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(add) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT(sub) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(sub) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(inc) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL(dec) - -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(inc_mod, unsigned int) -DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP(dec_mod, unsigned int) +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeCore, "workgroup") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeDevice, "agent") +DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeNode, "") 
+DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD(MemoryScopeSystem, "") -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_FLOATING_POINT -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP_INTEGRAL -#undef DESUL_IMPL_HIP_DEVICE_ATOMIC_FETCH_OP +#undef DESUL_IMPL_HIP_ATOMIC_FETCH_INC_MOD } // namespace Impl } // namespace desul diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ab570ac5787a787758d918de5c70e5b916c4c141 --- /dev/null +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp @@ -0,0 +1,431 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ +#ifndef DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ + +#include <algorithm> // min, max +#include <desul/atomics/Common.hpp> +#include <type_traits> + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +template <class T> +inline constexpr bool is_openacc_integral_type_v = + std::is_same_v<T, int> || std::is_same_v<T, unsigned int> || + std::is_same_v<T, unsigned long long>; + +template <class T> +inline constexpr bool is_openacc_arithmetic_type_v = std::is_same_v<T, float> || +#ifndef DESUL_CUDA_ARCH_IS_PRE_PASCAL + std::is_same_v<T, double> || +#endif + is_openacc_integral_type_v<T>; + +#else + +template <class T> +inline constexpr bool is_openacc_integral_type_v = std::is_integral_v<T>; + +template <class T> +inline constexpr bool is_openacc_arithmetic_type_v = std::is_arithmetic_v<T>; + +#endif + +//<editor-fold +// desc="device_atomic_fetch_{add,sub,mul,div,lshift,rshift,mod,max,min,and,or,xor}"> +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_fetch_add( + T* 
ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += val; + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_fetch_inc( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += T(1); + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_fetch_sub( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= val; + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_fetch_dec( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= T(1); + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_fetch_mul( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr *= val; + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_fetch_div( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr /= val; + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_fetch_lshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr << val; + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_fetch_rshift( + T* ptr, const unsigned int val, 
MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr >> val; + } + return old; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_fetch_max( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; + old = atomicMax(ptr, val); + return old; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_fetch_min( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; /* T, not int, so the previous value is not narrowed for non-int T */ + old = atomicMin(ptr, val); + return old; +} +#endif + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_fetch_and( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr &= val; + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_fetch_or( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr |= val; + } + return old; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_fetch_xor( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr ^= val; + } + return old; +} +//</editor-fold> + +//<editor-fold +// desc="device_atomic_{add,sub,mul,div,lshift,rshift,mod,max,min,and,or,xor}_fetch"> +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_add_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> 
+std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_inc_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_sub_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_dec_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_mul_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr *= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_div_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr /= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_lshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr << val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_rshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr >> val; + tmp = *ptr; + } + return tmp; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template <class 
T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_max_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMax(ptr, val); + tmp = std::max(tmp, val); + return tmp; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_min_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMin(ptr, val); + tmp = std::min(tmp, val); + return tmp; +} +#endif + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_and_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr &= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_or_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr |= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_integral_type_v<T>, T> device_atomic_xor_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr ^= val; + tmp = *ptr; + } + return tmp; +} +//</editor-fold> + +//<editor-fold desc="device_atomic_{store,load}"> +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelease, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_store(MemoryOrderRelease): Not 
supported atomic " + "operation in the OpenACC backend\n"); + } +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_load( + const T* const ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} + +#pragma acc routine seq +template <class T> +std::enable_if_t<is_openacc_arithmetic_type_v<T>, T> device_atomic_load( + const T* const ptr, MemoryOrderAcquire, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} +//</editor-fold> + +} // namespace Impl +} // namespace desul + +#endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp index fef10222e34ed056b89dc0cc8babfb91fd504d00..fa71477c29962b9604fa7e635d7447c0f6dfd959 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp @@ -18,11 +18,14 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_thread_fence(MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_DEVICE(return Impl::device_atomic_thread_fence(order, scope);) DESUL_IF_ON_HOST(return Impl::host_atomic_thread_fence(order, scope);) } + +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { @@ -30,6 +33,7 @@ atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_HOST(return Impl::host_atomic_exchange(dest, val, order, scope);) } 
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope scope) { @@ -40,6 +44,7 @@ atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope sc } // Fetch_Oper atomics: return value before operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -47,6 +52,7 @@ atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -54,6 +60,7 @@ atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -61,6 +68,7 @@ atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -68,6 +76,7 @@ atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T 
atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -75,6 +84,7 @@ atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -82,6 +92,7 @@ atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -89,6 +100,7 @@ atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -96,6 +108,7 @@ atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_and(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -103,6 +116,7 @@ atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_or(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -110,6 
+124,7 @@ atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_xor(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -117,6 +132,7 @@ atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_nand(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, const unsigned int val, @@ -126,6 +142,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_lshift(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, const unsigned int val, @@ -136,6 +153,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, } // Oper Fetch atomics: return value after operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -143,6 +161,7 @@ atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_add_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -150,6 +169,7 @@ atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_sub_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE 
template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -157,6 +177,7 @@ atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_max_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -164,6 +185,7 @@ atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_min_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -171,6 +193,7 @@ atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mul_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -178,6 +201,7 @@ atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_div_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -185,6 +209,7 @@ atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mod_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T 
atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -192,6 +217,7 @@ atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_and_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -199,6 +225,7 @@ atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_or_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -206,6 +233,7 @@ atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_xor_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -213,6 +241,7 @@ atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_nand_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, const unsigned int val, @@ -222,6 +251,7 @@ DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_lshift_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, const unsigned int val, @@ -233,6 +263,7 @@ DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const 
dest, // Other atomics +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, MemoryOrder order, @@ -241,6 +272,7 @@ DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_load(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_store(T* const dest, const T val, @@ -250,6 +282,7 @@ DESUL_INLINE_FUNCTION void atomic_store(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_store(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_add(T* const dest, const T val, @@ -259,6 +292,7 @@ DESUL_INLINE_FUNCTION void atomic_add(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, const T val, @@ -268,6 +302,7 @@ DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, const T val, @@ -277,6 +312,7 @@ DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_div(T* const dest, const T val, @@ -286,6 +322,7 @@ DESUL_INLINE_FUNCTION void atomic_div(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void 
atomic_min(T* const dest, const T val, @@ -295,6 +332,7 @@ DESUL_INLINE_FUNCTION void atomic_min(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_max(T* const dest, const T val, @@ -304,6 +342,7 @@ DESUL_INLINE_FUNCTION void atomic_max(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, MemoryOrder order, @@ -312,6 +351,7 @@ DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, MemoryOrder order, @@ -320,6 +360,7 @@ DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_dec_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, MemoryOrder order, @@ -328,6 +369,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -335,6 +377,7 @@ atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, MemoryOrder order, @@ 
-343,6 +386,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION T atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -350,6 +394,7 @@ atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, MemoryOrder order, @@ -358,6 +403,7 @@ DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class MemoryOrder, class MemoryScope> DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, MemoryOrder order, @@ -367,6 +413,7 @@ DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, } // FIXME +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class SuccessMemoryOrder, class FailureMemoryOrder, @@ -387,6 +434,7 @@ DESUL_INLINE_FUNCTION bool atomic_compare_exchange_strong( } } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template <class T, class SuccessMemoryOrder, class FailureMemoryOrder, diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp index 8216f9a797c94b45be905fb79a53a68342c73f68..e1170ed2aae81a81bf4d3208153464327e6eca55 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp @@ -57,14 +57,35 @@ void finalize_lock_arrays_sycl(sycl::queue q); * \brief This global variable in SYCL space is what kernels use to get access * to the lock arrays. 
* - * There is only one single instance of this global variable for the entire - * executable, whose definition will be in Kokkos_SYCL_Locks.cpp (and whose - * declaration here must be extern). This one instance will be initialized - * by initialize_host_sycl_lock_arrays and need not be modified afterwards. + * When relocatable device code is enabled, there is only one single instance of this + * global variable for the entire executable, whose definition will be in + * Kokkos_SYCL_Locks.cpp (and whose declaration here must then be extern). This one + * instance will be initialized by initialize_host_sycl_lock_arrays and need not be + * modified afterwards. + * + * When relocatable device code is disabled, an instance of this variable will be + * created in every translation unit that sees this header file (we make this clear by + * marking it static, meaning no other translation unit can link to it). Since the + * Kokkos_SYCL_Locks.cpp translation unit cannot initialize the instances in other + * translation units, we must update this SYCL global variable based on the Host global + * variable prior to running any kernels that will use it. That is the purpose of the + * ensure_sycl_lock_arrays_on_device function. 
*/ -SYCL_EXTERNAL extern sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_DEVICE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global<int32_t*> + SYCL_SPACE_ATOMIC_LOCKS_DEVICE; -SYCL_EXTERNAL extern sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_NODE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global<int32_t*> + SYCL_SPACE_ATOMIC_LOCKS_NODE; #define SYCL_SPACE_ATOMIC_MASK 0x1FFFF @@ -128,6 +149,34 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref.exchange(0); } +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue q) { + static bool once = [&q]() { +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, + &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, + &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); +#else + auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; + auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + q.single_task([=] { + SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; + }); +#endif + return true; + }(); + (void)once; +} + #else // not supported template <typename /*AlwaysInt*/ = int> @@ -155,7 +204,26 @@ inline bool lock_address_sycl(void*, MemoryScopeNode) { inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); } inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); } + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue) { +} + #endif } // namespace Impl + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline void ensure_sycl_lock_arrays_on_device(sycl::queue) {} +#else +static inline void 
ensure_sycl_lock_arrays_on_device(sycl::queue q) { + Impl::copy_sycl_lock_arrays_to_device(q); +} +#endif + } // namespace desul #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp index cb97f4a906db19f16ca66ad9efba96b2b0908351..b6a399100b177cd1f7d1c6ee08a832e85ceb7d09 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp @@ -17,6 +17,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_HIP_ATOMICS #include <desul/atomics/Lock_Based_Fetch_Op_HIP.hpp> #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include <desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp> +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include <desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp> #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d4dd74588bda11dace4563ab8b6929781996a3ae --- /dev/null +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp @@ -0,0 +1,81 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ + +#include <desul/atomics/Common.hpp> +#include <desul/atomics/Lock_Array.hpp> +#include <desul/atomics/Thread_Fence.hpp> +#include <type_traits> + +namespace desul { +namespace Impl { + +template <class Oper, + class T, + class MemoryOrder, + class MemoryScope, + // equivalent to: + // requires !atomic_always_lock_free(sizeof(T)) + std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0> +inline T device_atomic_fetch_oper(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t<const T> val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = op.apply(return_val, val); + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +template <class Oper, + class T, + class MemoryOrder, + class MemoryScope, + // equivalent to: + // requires !atomic_always_lock_free(sizeof(T)) + std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0> +inline T device_atomic_oper_fetch(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t<const T> val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_oper_fetch(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = op.apply(*dest, val); + *dest = 
return_val; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp index 8774a6e96eb5f41443a0c09295c48d3944623924..12d9de1604ad0c0e7f871d6694a52894cc129a2f 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp @@ -32,7 +32,11 @@ T device_atomic_fetch_oper(const Oper& op, // This is a way to avoid deadlock in a subgroup T return_val; int done = 0; +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto sg = sycl::ext::oneapi::this_work_item::get_sub_group(); +#else auto sg = sycl::ext::oneapi::experimental::this_sub_group(); +#endif using sycl::ext::oneapi::group_ballot; using sycl::ext::oneapi::sub_group_mask; sub_group_mask active = group_ballot(sg, 1); @@ -68,7 +72,11 @@ T device_atomic_oper_fetch(const Oper& op, // This is a way to avoid deadlock in a subgroup T return_val; int done = 0; +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto sg = sycl::ext::oneapi::this_work_item::get_sub_group(); +#else auto sg = sycl::ext::oneapi::experimental::this_sub_group(); +#endif using sycl::ext::oneapi::group_ballot; using sycl::ext::oneapi::sub_group_mask; sub_group_mask active = group_ballot(sg, 1); diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp index 3a14b93d323033d82f7a260a4a6c2a233d9dccdc..d11beb0c805082f73d21d30df75c75e5981c6e17 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp @@ -57,6 +57,10 @@ SPDX-License-Identifier: (BSD-3-Clause) 
#define DESUL_HAVE_OPENMP_ATOMICS #endif +#if defined(DESUL_ATOMICS_ENABLE_OPENACC) +#define DESUL_HAVE_OPENACC_ATOMICS +#endif + // ONLY use GNUC atomics if not explicitly say to use OpenMP atomics #if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) #define DESUL_HAVE_GCC_ATOMICS @@ -123,6 +127,30 @@ static constexpr bool desul_impl_omp_on_host() { return false; } #endif #endif +#if defined(DESUL_HAVE_OPENACC_ATOMICS) +#include <openacc.h> +#ifdef __NVCOMPILER +// FIXME_OPENACC We cannot determine in a constant expresion whether we are on host or +// on device with NVHPC. We use the device implementation on both sides. +#define DESUL_IF_ON_DEVICE(CODE) \ + { DESUL_IMPL_STRIP_PARENS(CODE) } +#define DESUL_IF_ON_HOST(CODE) \ + {} +#else +#define DESUL_IF_ON_DEVICE(CODE) \ + if constexpr (acc_on_device(acc_device_not_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#define DESUL_IF_ON_HOST(CODE) \ + if constexpr (acc_on_device(acc_device_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#endif +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE _Pragma("acc routine seq") +#else +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE +#endif + #if !defined(DESUL_IF_ON_HOST) && !defined(DESUL_IF_ON_DEVICE) #if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Operator_Function_Objects.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Operator_Function_Objects.hpp index be90cdbbd86ffc42a674eacc5f1cea70dc42c8cf..1f5159c4f8b01b5f7f8643286abdc01ce74760c2 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Operator_Function_Objects.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Operator_Function_Objects.hpp @@ -18,7 +18,7 @@ namespace desul { namespace Impl { template <class Scalar1, class Scalar2> -struct max_operator { +struct max_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const 
Scalar1& val1, const Scalar2& val2) { return (val1 > val2 ? val1 : val2); @@ -30,7 +30,7 @@ struct max_operator { }; template <class Scalar1, class Scalar2> -struct min_operator { +struct min_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return (val1 < val2 ? val1 : val2); @@ -70,55 +70,55 @@ constexpr DESUL_FUNCTION } template <class Scalar1, class Scalar2> -struct add_operator { +struct add_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 + val2; } }; template <class Scalar1, class Scalar2> -struct sub_operator { +struct sub_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 - val2; } }; template <class Scalar1, class Scalar2> -struct mul_operator { +struct mul_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 * val2; } }; template <class Scalar1, class Scalar2> -struct div_operator { +struct div_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 / val2; } }; template <class Scalar1, class Scalar2> -struct mod_operator { +struct mod_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 % val2; } }; template <class Scalar1, class Scalar2> -struct and_operator { +struct and_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 & val2; } }; template <class Scalar1, class Scalar2> -struct or_operator { +struct or_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 | val2; } }; template <class Scalar1, class Scalar2> -struct xor_operator { +struct xor_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& 
val1, const Scalar2& val2) { return val1 ^ val2; } }; template <class Scalar1, class Scalar2> -struct nand_operator { +struct nand_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return ~(val1 & val2); @@ -126,7 +126,7 @@ struct nand_operator { }; template <class Scalar1, class Scalar2> -struct lshift_operator { +struct lshift_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 << val2; @@ -134,7 +134,7 @@ struct lshift_operator { }; template <class Scalar1, class Scalar2> -struct rshift_operator { +struct rshift_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 >> val2; @@ -142,7 +142,7 @@ struct rshift_operator { }; template <class Scalar1, class Scalar2> -struct inc_mod_operator { +struct inc_mod_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return ((val1 >= val2) ? Scalar1(0) : val1 + Scalar1(1)); @@ -150,7 +150,7 @@ struct inc_mod_operator { }; template <class Scalar1, class Scalar2> -struct dec_mod_operator { +struct dec_mod_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return (((val1 == Scalar1(0)) | (val1 > val2)) ? 
val2 : (val1 - Scalar1(1))); @@ -158,13 +158,13 @@ struct dec_mod_operator { }; template <class Scalar1, class Scalar2> -struct store_operator { +struct store_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1&, const Scalar2& val2) { return val2; } }; template <class Scalar1, class Scalar2> -struct load_operator { +struct load_fetch_operator { DESUL_FORCEINLINE_FUNCTION static Scalar1 apply(const Scalar1& val1, const Scalar2&) { return val1; } }; diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp index 24078aae07fe90196a77cba4e3114fe086dbf049..6a741f6d478c2cad5a1740c093d48dadbcb75ce2 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include <desul/atomics/Thread_Fence_OpenMP.hpp> #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include <desul/atomics/Thread_Fence_OpenACC.hpp> +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include <desul/atomics/Thread_Fence_SYCL.hpp> #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a5c8aa1c8a72595419795f77e0bc50982a1da99f --- /dev/null +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp @@ -0,0 +1,25 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ +#define DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ + +namespace desul { +namespace Impl { + +#pragma acc routine seq +template <class MemoryOrder, class MemoryScope> +void device_atomic_thread_fence(MemoryOrder, MemoryScope) { + // FIXME_OPENACC: The current OpenACC standard does not support explicit thread fence + // operations. +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp index 9e84c60e41a541df54b4aa31eef9a039815f6705..6660c76e11a36ff1fb63b63cd0ace32be1ccbcce 100644 --- a/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp +++ b/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp @@ -14,10 +14,12 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul::Impl { +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION SYCL_EXTERNAL sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_DEVICE; SYCL_EXTERNAL sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_NODE; +#endif int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; @@ -31,19 +33,7 @@ void init_lock_arrays_sycl<int>(sycl::queue q) { SYCL_SPACE_ATOMIC_LOCKS_NODE_h = sycl::malloc_host<int32_t>(SYCL_SPACE_ATOMIC_MASK + 1, q); - // FIXME_SYCL Once supported, the following should be replaced by - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, - // &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, - // sizeof(int32_t*)); - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, - // &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, - // sizeof(int32_t*)); - auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; - auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; - q.single_task([=] { - SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; - SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; - }); + copy_sycl_lock_arrays_to_device(q); 
q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, 0, @@ -63,7 +53,10 @@ void finalize_lock_arrays_sycl<int>(sycl::queue q) { sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q); SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION + copy_sycl_lock_arrays_to_device(q); +#endif } -} // namespace desul::Impl +} // namespace desul::Impl #endif diff --git a/packages/kokkos/tpls/gtest/gtest/gtest.h b/packages/kokkos/tpls/gtest/gtest/gtest.h index c17c9ab3fc2295eded01956cd7820a33a1accb2a..3a1aa446d49bff69511b9fd6764479b4d11493af 100644 --- a/packages/kokkos/tpls/gtest/gtest/gtest.h +++ b/packages/kokkos/tpls/gtest/gtest/gtest.h @@ -4770,11 +4770,10 @@ class NeverThrown { #endif // GTEST_HAS_RTTI #define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ - catch (typename std::conditional< \ - std::is_same<typename std::remove_cv<typename std::remove_reference< \ - expected_exception>::type>::type, \ - std::exception>::value, \ - const ::testing::internal::NeverThrown&, const std::exception&>::type \ + catch (std::conditional_t< \ + std::is_same_v<std::remove_cv_t<std::remove_reference_t< \ + expected_exception>>, std::exception>, \ + const ::testing::internal::NeverThrown&, const std::exception&> \ e) { \ gtest_msg.value = "Expected: " #statement \ " throws an exception of type " #expected_exception \ @@ -4910,7 +4909,7 @@ class NeverThrown { class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ : public parent_class { \ public: \ - GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() { (void)test_info_; }\ ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ test_name)); \ diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp 
b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp index ab1561bd47fa57f31004f3c8e56e361eb55c4c76..25389a2fa5e7be9c2f3bdb35d0a5ff4a746b027a 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp @@ -27,165 +27,165 @@ namespace detail { // For no unique address emulation, this is the case taken when neither are empty. // For real `[[no_unique_address]]`, this case is always taken. -template <class _T, class _U, class _Enable = void> struct __compressed_pair { - _MDSPAN_NO_UNIQUE_ADDRESS _T __t_val; - _MDSPAN_NO_UNIQUE_ADDRESS _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; +template <class _T1, class _T2, class _Enable = void> struct __compressed_pair { + _MDSPAN_NO_UNIQUE_ADDRESS _T1 __t1_val{}; + _MDSPAN_NO_UNIQUE_ADDRESS _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - 
constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template <class _TLike, class _ULike> - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : __t_val((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template <class _T1Like, class _T2Like> + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : __t1_val((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; #if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) // First empty. 
-template <class _T, class _U> +template <class _T1, class _T2> struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && !_MDSPAN_TRAIT(std::is_empty, _U)>> - : private _T { - _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { - return *static_cast<_T *>(this); + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && !_MDSPAN_TRAIT(std::is_empty, _T2)>> + : private _T1 { + _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { + return *static_cast<_T1 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return *static_cast<_T const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return *static_cast<_T1 const *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - 
operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template <class _TLike, class _ULike> - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _T((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template <class _T1Like, class _T2Like> + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T1((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; // Second empty. -template <class _T, class _U> +template <class _T1, class _T2> struct __compressed_pair< - _T, _U, - std::enable_if_t<!_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>> - : private _U { - _T __t_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; + _T1, _T2, + std::enable_if_t<!_MDSPAN_TRAIT(std::is_empty, _T1) && _MDSPAN_TRAIT(std::is_empty, _T2)>> + : private _T2 { + _T1 __t1_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { - return *static_cast<_U *>(this); + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { + return *static_cast<_T2 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return *static_cast<_U const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return *static_cast<_T2 const *>(this); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr 
__compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; + ~__compressed_pair() = default; - template <class _TLike, class _ULike> - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _U((_ULike &&) __u), __t_val((_TLike &&) __t) {} + template <class _T1Like, class _T2Like> + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T2((_T2Like &&) __t2), __t1_val((_T1Like &&) __t1) {} }; // Both empty. -template <class _T, class _U> +template <class _T1, class _T2> struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>> + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && _MDSPAN_TRAIT(std::is_empty, _T2)>> // We need to use the __no_unique_address_emulation wrapper here to avoid // base class ambiguities. #ifdef _MDSPAN_COMPILER_MSVC // MSVC doesn't allow you to access public static member functions of a type // when you *happen* to privately inherit from that type. 
- : protected __no_unique_address_emulation<_T, 0>, - protected __no_unique_address_emulation<_U, 1> + : protected __no_unique_address_emulation<_T1, 0>, + protected __no_unique_address_emulation<_T2, 1> #else - : private __no_unique_address_emulation<_T, 0>, - private __no_unique_address_emulation<_U, 1> + : private __no_unique_address_emulation<_T1, 0>, + private __no_unique_address_emulation<_T2, 1> #endif { - using __first_base_t = __no_unique_address_emulation<_T, 0>; - using __second_base_t = __no_unique_address_emulation<_U, 1>; + using __first_base_t = __no_unique_address_emulation<_T1, 0>; + using __second_base_t = __no_unique_address_emulation<_T2, 1>; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return this->__second_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { return this->__second_base_t::__ref(); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED 
__compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template <class _TLike, class _ULike> - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) noexcept - : __first_base_t(_T((_TLike &&) __t)), - __second_base_t(_U((_ULike &&) __u)) + ~__compressed_pair() = default; + template <class _T1Like, class _T2Like> + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) noexcept + : __first_base_t(_T1((_T1Like &&) __t1)), + __second_base_t(_T2((_T2Like &&) __t2)) { } }; diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp index d35e201cebd2bd8d0b1b99e6409e618a440c7a68..e8cacf40d601db083f11aa7a71bc14d8a3db8e9b 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp @@ -35,10 +35,17 @@ #define MDSPAN_CXX_STD_14 201402L #define MDSPAN_CXX_STD_17 201703L #define MDSPAN_CXX_STD_20 202002L +// Note GCC has not updated this in version 13 +#ifdef __clang__ +#define MDSPAN_CXX_STD_23 202302L +#else +#define MDSPAN_CXX_STD_23 202100L +#endif #define MDSPAN_HAS_CXX_14 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14) #define MDSPAN_HAS_CXX_17 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_17) #define MDSPAN_HAS_CXX_20 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_20) +#define MDSPAN_HAS_CXX_23 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_23) static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or later."); @@ -198,7 +205,7 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires 
C++14 or #endif #ifndef _MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION -# if (!defined(__NVCC__) || (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) && \ +# if (!defined(__NVCC__) || (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10 >= 1170)) && \ ((defined(__cpp_deduction_guides) && __cpp_deduction_guides >= 201703) || \ (!defined(__cpp_deduction_guides) && MDSPAN_HAS_CXX_17)) # define _MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION 1 @@ -224,7 +231,7 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #endif #ifndef MDSPAN_CONDITIONAL_EXPLICIT -# if MDSPAN_HAS_CXX_20 && !defined(_MDSPAN_COMPILER_MSVC) +# if MDSPAN_HAS_CXX_20 # define MDSPAN_CONDITIONAL_EXPLICIT(COND) explicit(COND) # else # define MDSPAN_CONDITIONAL_EXPLICIT(COND) @@ -233,7 +240,13 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #ifndef MDSPAN_USE_BRACKET_OPERATOR # if defined(__cpp_multidimensional_subscript) -# define MDSPAN_USE_BRACKET_OPERATOR 1 +// The following if/else is necessary to workaround a clang issue +// relative to using a parameter pack inside a bracket operator in C++2b/C++23 mode +# if defined(_MDSPAN_COMPILER_CLANG) && ((__clang_major__ == 15) || (__clang_major__ == 16)) +# define MDSPAN_USE_BRACKET_OPERATOR 0 +# else +# define MDSPAN_USE_BRACKET_OPERATOR 1 +# endif # else # define MDSPAN_USE_BRACKET_OPERATOR 0 # endif diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp index 0dd31c4cd0aacb38b1fff605f6101059195e2d90..d58d37732dda6896b45cc69e10d5c6dc5855db2b 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp @@ -16,12 +16,15 @@ #pragma once #include "dynamic_extent.hpp" +#include "utility.hpp" #ifdef __cpp_lib_span #include <span> #endif #include <array> +#include <type_traits> 
+#include <cassert> #include <cinttypes> namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -30,6 +33,7 @@ namespace detail { // Function used to check compatibility of extents in converting constructor // can't be a private member function for some reason. template <size_t... Extents, size_t... OtherExtents> +MDSPAN_INLINE_FUNCTION static constexpr std::integral_constant<bool, false> __check_compatible_extents( std::integral_constant<bool, false>, std::integer_sequence<size_t, Extents...>, @@ -46,6 +50,7 @@ struct __compare_extent_compatible : std::integral_constant<bool, {}; template <size_t... Extents, size_t... OtherExtents> +MDSPAN_INLINE_FUNCTION static constexpr std::integral_constant< bool, _MDSPAN_FOLD_AND(__compare_extent_compatible<Extents, OtherExtents>::value)> __check_compatible_extents( @@ -55,6 +60,14 @@ __check_compatible_extents( return {}; } +template<class IndexType, class ... Arguments> +MDSPAN_INLINE_FUNCTION +static constexpr bool are_valid_indices() { + return + _MDSPAN_FOLD_AND(std::is_convertible<Arguments, IndexType>::value) && + _MDSPAN_FOLD_AND(std::is_nothrow_constructible<IndexType, Arguments>::value); +} + // ------------------------------------------------------------------ // ------------ static_array ---------------------------------------- // ------------------------------------------------------------------ @@ -140,7 +153,8 @@ struct index_sequence_scan_impl<R, FirstVal, Values...> { template <size_t R, size_t FirstVal> struct index_sequence_scan_impl<R, FirstVal> { -#if defined(__NVCC__) || defined(__NVCOMPILER) +#if defined(__NVCC__) || defined(__NVCOMPILER) || \ + defined(_MDSPAN_COMPILER_INTEL) // NVCC warns about pointless comparison with 0 for R==0 and r being const // evaluatable and also 0. MDSPAN_INLINE_FUNCTION @@ -167,7 +181,7 @@ template <> struct index_sequence_scan_impl<0> { // all static values. 
template <class T, size_t N> struct possibly_empty_array { - T vals[N]; + T vals[N]{}; MDSPAN_INLINE_FUNCTION constexpr T &operator[](size_t r) { return vals[r]; } MDSPAN_INLINE_FUNCTION @@ -251,12 +265,17 @@ public: #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, - /* requires */ (N == m_size_dynamic)) + /* requires */ (N == m_size_dynamic && N > 0)) MDSPAN_INLINE_FUNCTION constexpr maybe_static_array(const std::span<T, N> &vals) { for (size_t r = 0; r < N; r++) m_dyn_vals[r] = static_cast<TDynamic>(vals[r]); } + + MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, + /* requires */ (N == m_size_dynamic && N == 0)) + MDSPAN_INLINE_FUNCTION + constexpr maybe_static_array(const std::span<T, N> &) : m_dyn_vals{} {} #endif // constructors from all values @@ -423,9 +442,9 @@ public: class OtherIndexType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && + _MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, - OtherIndexType) && + const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -436,8 +455,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class OtherIndexType, size_t N, /* requires */ - (_MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, OtherIndexType) && + (_MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -454,6 +473,7 @@ private: size_t DynCount, size_t R, class OtherExtents, class... 
DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) == dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant<size_t, DynCount>, std::integral_constant<size_t, R>, const OtherExtents &exts, @@ -468,6 +488,7 @@ private: size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) != dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant<size_t, DynCount>, std::integral_constant<size_t, R>, const OtherExtents &exts, @@ -481,6 +502,7 @@ private: size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R == m_rank) && (DynCount == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant<size_t, DynCount>, std::integral_constant<size_t, R>, const OtherExtents &, @@ -491,17 +513,20 @@ private: public: // Converting constructor from other extents specializations - MDSPAN_TEMPLATE_REQUIRES( - class OtherIndexType, size_t... OtherExtents, - /* requires */ - ( - /* multi-stage check to protect from invalid pack expansion when sizes - don't match? */ - decltype(detail::__check_compatible_extents( - std::integral_constant<bool, sizeof...(Extents) == - sizeof...(OtherExtents)>{}, + MDSPAN_TEMPLATE_REQUIRES( + class OtherIndexType, size_t... OtherExtents, + /* requires */ + ( + /* multi-stage check to protect from invalid pack expansion when sizes + don't match? 
*/ + decltype(detail::__check_compatible_extents( + // using: sizeof...(Extents) == sizeof...(OtherExtents) as the second argument fails with MSVC+NVCC with some obscure expansion error + // MSVC: 19.38.33133 NVCC: 12.0 + std::integral_constant<bool, extents<int, Extents...>::rank() == extents<int, OtherExtents...>::rank()>{}, std::integer_sequence<size_t, Extents...>{}, - std::integer_sequence<size_t, OtherExtents...>{}))::value)) + std::integer_sequence<size_t, OtherExtents...>{}))::value + ) + ) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT((((Extents != dynamic_extent) && (OtherExtents == dynamic_extent)) || @@ -518,10 +543,9 @@ public: MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const extents &lhs, const extents<OtherIndexType, OtherExtents...> &rhs) noexcept { - bool value = true; - for (size_type r = 0; r < m_rank; r++) - value &= rhs.extent(r) == lhs.extent(r); - return value; + return + rank() == extents<OtherIndexType, OtherExtents...>::rank() && + detail::rankwise_equal(detail::with_rank<rank()>{}, rhs, lhs, detail::extent); } #if !(MDSPAN_HAS_CXX_20) @@ -570,7 +594,7 @@ using dextents = typename detail::__make_dextents<IndexType, Rank>::type; template <class... IndexTypes> extents(IndexTypes...) -> extents<size_t, - size_t((IndexTypes(), ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent))...>; + ((void) sizeof(IndexTypes), ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent)...>; #endif // Helper type traits for identifying a class as extents. 
@@ -590,5 +614,80 @@ static #endif constexpr bool __is_extents_v = __is_extents<T>::value; +template<class InputIndexType, class ExtentsIndexType> +MDSPAN_INLINE_FUNCTION +constexpr void +check_lower_bound(InputIndexType user_index, + ExtentsIndexType /* current_extent */, + std::true_type /* is_signed */) +{ + (void) user_index; // prevent unused variable warning +#ifdef _MDSPAN_DEBUG + assert(static_cast<ExtentsIndexType>(user_index) >= 0); +#endif +} + +template<class InputIndexType, class ExtentsIndexType> +MDSPAN_INLINE_FUNCTION +constexpr void +check_lower_bound(InputIndexType /* user_index */, + ExtentsIndexType /* current_extent */, + std::false_type /* is_signed */) +{} + +template<class InputIndexType, class ExtentsIndexType> +MDSPAN_INLINE_FUNCTION +constexpr void +check_upper_bound(InputIndexType user_index, + ExtentsIndexType current_extent) +{ + (void) user_index; // prevent unused variable warnings + (void) current_extent; +#ifdef _MDSPAN_DEBUG + assert(static_cast<ExtentsIndexType>(user_index) < current_extent); +#endif +} + +// Returning true to use AND fold instead of comma +// CPP14 mode doesn't like the use of void expressions +// with the way the _MDSPAN_FOLD_AND is set up +template<class InputIndex, class ExtentsIndexType> +MDSPAN_INLINE_FUNCTION +constexpr bool +check_one_index(InputIndex user_index, + ExtentsIndexType current_extent) +{ + check_lower_bound(user_index, current_extent, + std::integral_constant<bool, std::is_signed<ExtentsIndexType>::value>{}); + check_upper_bound(user_index, current_extent); + return true; +} + +template<size_t ... RankIndices, + class ExtentsIndexType, size_t ... Exts, + class ... Indices> +MDSPAN_INLINE_FUNCTION +constexpr void +check_all_indices_helper(std::index_sequence<RankIndices...>, + const extents<ExtentsIndexType, Exts...>& exts, + Indices... 
indices) +{ + // Suppress warning about statement has no effect + (void) _MDSPAN_FOLD_AND( + (check_one_index(indices, exts.extent(RankIndices))) + ); +} + +template<class ExtentsIndexType, size_t ... Exts, + class ... Indices> +MDSPAN_INLINE_FUNCTION +constexpr void +check_all_indices(const extents<ExtentsIndexType, Exts...>& exts, + Indices... indices) +{ + check_all_indices_helper(std::make_index_sequence<sizeof...(Indices)>(), + exts, indices...); +} + } // namespace detail } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp index af44494a98d85bbb377a06e286a55b2b88d30414..ed8aae020b66fac96d50b5d3288b4de497ee9747 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp @@ -18,6 +18,12 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" +#include "layout_stride.hpp" +#include "utility.hpp" +#if MDSPAN_HAS_CXX_17 +#include "../__p2642_bits/layout_padded_fwd.hpp" +#endif +#include <type_traits> namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -108,6 +114,36 @@ class layout_left::mapping { */ } +#if MDSPAN_HAS_CXX_17 + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_left_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v<extents_type, typename _Mapping::extents_type> + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<typename _Mapping::extents_type, extents_type>)) + mapping(const _Mapping& __other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates< + extents_type, _Mapping>(detail::with_rank<extents_type::rank()>{}); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(detail::with_rank<extents_type::rank()>{}, __other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -123,16 +159,7 @@ class layout_left::mapping { * TODO: check precondition * other.required_span_size() is a representable value of type index_type */ - #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=0; r<__extents.rank(); r++) { - if(stride != static_cast<index_type>(other.stride(r))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_left with invalid strides."); - } - stride *= __extents.extent(r); - } - #endif + detail::validate_strides(detail::with_rank<extents_type::rank()>{}, layout_left{}, __extents, other); } MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED mapping& operator=(mapping const&) noexcept = default; @@ -155,14 +182,14 @@ class layout_left::mapping { class... 
Indices, /* requires */ ( (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (detail::are_valid_indices<index_type, Indices...>()) ) ) _MDSPAN_HOST_DEVICE constexpr index_type operator()(Indices... idxs) const noexcept { +#if ! defined(NDEBUG) + detail::check_all_indices(this->extents(), idxs...); +#endif // ! NDEBUG return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast<index_type>(idxs)...); } @@ -172,9 +199,9 @@ class layout_left::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -187,7 +214,10 @@ class layout_left::mapping { return value; } - template<class OtherExtents> + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping<OtherExtents> const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -195,7 +225,10 @@ class layout_left::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template<class OtherExtents> + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires 
*/ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping<OtherExtents> const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -204,10 +237,12 @@ class layout_left::mapping { // Not really public, but currently needed to implement fully constexpr useable submdspan: template<size_t N, class SizeType, size_t ... E, size_t ... Idx> + MDSPAN_INLINE_FUNCTION constexpr index_type __get_stride(MDSPAN_IMPL_STANDARD_NAMESPACE::extents<SizeType, E...>,std::integer_sequence<size_t, Idx...>) const { return _MDSPAN_FOLD_TIMES_RIGHT((Idx<N? __extents.template __extent<Idx>():1),1); } template<size_t N> + MDSPAN_INLINE_FUNCTION constexpr index_type __stride() const noexcept { return __get_stride<N>(__extents, std::make_index_sequence<extents_type::rank()>()); } @@ -215,6 +250,18 @@ class layout_left::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... 
slices) { + return src.submdspan_mapping_impl(slices...); + } }; diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp index a0586484202efda912bab37d94df2999165fb079..26115e7a34087efba273ff5e2f645f9d10af4732 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp @@ -18,8 +18,11 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" -#include <stdexcept> #include "layout_stride.hpp" +#include "utility.hpp" +#if MDSPAN_HAS_CXX_17 +#include "../__p2642_bits/layout_padded_fwd.hpp" +#endif namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -113,6 +116,34 @@ class layout_right::mapping { */ } + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_right_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
+ */ +#if MDSPAN_HAS_CXX_17 + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v<extents_type, typename _Mapping::extents_type>)) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<typename _Mapping::extents_type, extents_type>)) + mapping(const _Mapping &__other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates< + extents_type, _Mapping>(detail::with_rank<extents_type::rank()>{}); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(detail::with_rank<extents_type::rank()>{}, __other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -128,16 +159,7 @@ class layout_right::mapping { * TODO: check precondition * other.required_span_size() is a representable value of type index_type */ - #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=__extents.rank(); r>0; r--) { - if(stride != static_cast<index_type>(other.stride(r-1))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_right with invalid strides."); - } - stride *= __extents.extent(r-1); - } - #endif + detail::validate_strides(detail::with_rank<extents_type::rank()>{}, layout_right{}, __extents, other); } MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED mapping& operator=(mapping const&) noexcept = default; @@ -157,26 +179,26 @@ class layout_right::mapping { //-------------------------------------------------------------------------------- MDSPAN_TEMPLATE_REQUIRES( - class... Indices, + class ... 
Indices, /* requires */ ( - (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (sizeof...(Indices) == extents_type::rank()) && + (detail::are_valid_indices<index_type, Indices...>()) ) ) _MDSPAN_HOST_DEVICE constexpr index_type operator()(Indices... idxs) const noexcept { +#if ! defined(NDEBUG) + detail::check_all_indices(this->extents(), idxs...); +#endif // ! NDEBUG return __compute_offset(__rank_count<0, extents_type::rank()>(), static_cast<index_type>(idxs)...); } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -189,7 +211,10 @@ class layout_right::mapping { return value; } - template<class OtherExtents> + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping<OtherExtents> const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -197,7 +222,10 @@ class layout_right::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - 
template<class OtherExtents> + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ (Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping<OtherExtents> const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -206,10 +234,12 @@ class layout_right::mapping { // Not really public, but currently needed to implement fully constexpr useable submdspan: template<size_t N, class SizeType, size_t ... E, size_t ... Idx> + MDSPAN_INLINE_FUNCTION constexpr index_type __get_stride(MDSPAN_IMPL_STANDARD_NAMESPACE::extents<SizeType, E...>,std::integer_sequence<size_t, Idx...>) const { return _MDSPAN_FOLD_TIMES_RIGHT((Idx>N? __extents.template __extent<Idx>():1),1); } template<size_t N> + MDSPAN_INLINE_FUNCTION constexpr index_type __stride() const noexcept { return __get_stride<N>(__extents, std::make_index_sequence<extents_type::rank()>()); } @@ -217,6 +247,18 @@ class layout_right::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... 
slices) { + return src.submdspan_mapping_impl(slices...); + } }; } // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 030a494529b60043f962d7b1c4348bd4243ee33f..47ef2682d94209445b69629d880d0bd4f57d82aa 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -19,14 +19,16 @@ #include "extents.hpp" #include "trait_backports.hpp" #include "compressed_pair.hpp" +#include "utility.hpp" #if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) # include "no_unique_address.hpp" #endif -#include <algorithm> -#include <numeric> #include <array> +#include <type_traits> +#include <utility> + #ifdef __cpp_lib_span #include <span> #endif @@ -38,11 +40,11 @@ namespace MDSPAN_IMPL_STANDARD_NAMESPACE { struct layout_left { template<class Extents> - class mapping; + class mapping; }; struct layout_right { template<class Extents> - class mapping; + class mapping; }; namespace detail { @@ -79,6 +81,7 @@ namespace detail { std::bool_constant<M::is_always_unique()>::value; }; #endif + } // namespace detail struct layout_stride { @@ -88,7 +91,7 @@ struct layout_stride { : private detail::__no_unique_address_emulation< detail::__compressed_pair< Extents, - std::array<typename Extents::index_type, Extents::rank()> + detail::possibly_empty_array<typename Extents::index_type, Extents::rank()> > > #endif @@ -109,7 +112,7 @@ struct layout_stride { //---------------------------------------------------------------------------- - using __strides_storage_t = std::array<index_type, extents_type::rank()>; + using __strides_storage_t = detail::possibly_empty_array<index_type, extents_type::rank()>; using __member_pair_t = detail::__compressed_pair<extents_type, __strides_storage_t>; #if 
defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) @@ -158,14 +161,16 @@ struct layout_stride { template <class OtherExtents> MDSPAN_INLINE_FUNCTION static constexpr bool _eq_impl(mapping const& self, mapping<OtherExtents> const& other) noexcept { - return _MDSPAN_FOLD_AND((self.stride(Idxs) == other.stride(Idxs)) /* && ... */) - && _MDSPAN_FOLD_AND((self.extents().extent(Idxs) == other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t<index_type, typename OtherExtents::index_type>; + return _MDSPAN_FOLD_AND((static_cast<common_t>(self.stride(Idxs)) == static_cast<common_t>(other.stride(Idxs))) /* && ... */) + && _MDSPAN_FOLD_AND((static_cast<common_t>(self.extents().extent(Idxs)) == static_cast<common_t>(other.extents().extent(Idxs))) /* || ... */); } template <class OtherExtents> MDSPAN_INLINE_FUNCTION static constexpr bool _not_eq_impl(mapping const& self, mapping<OtherExtents> const& other) noexcept { - return _MDSPAN_FOLD_OR((self.stride(Idxs) != other.stride(Idxs)) /* || ... */) - || _MDSPAN_FOLD_OR((self.extents().extent(Idxs) != other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t<index_type, typename OtherExtents::index_type>; + return _MDSPAN_FOLD_OR((static_cast<common_t>(self.stride(Idxs)) != static_cast<common_t>(other.stride(Idxs))) /* || ... */) + || _MDSPAN_FOLD_OR((static_cast<common_t>(self.extents().extent(Idxs)) != static_cast<common_t>(other.extents().extent(Idxs))) /* || ... */); } template <class... 
Integral> @@ -192,19 +197,32 @@ struct layout_stride { } template<class IntegralType> - MDSPAN_INLINE_FUNCTION static constexpr const __strides_storage_t fill_strides(const std::array<IntegralType,extents_type::rank()>& s) { return __strides_storage_t{static_cast<index_type>(s[Idxs])...}; } + MDSPAN_TEMPLATE_REQUIRES( + class IntegralType, + (std::is_convertible<IntegralType, typename extents_type::index_type>::value) + ) + MDSPAN_INLINE_FUNCTION + // Need to avoid zero length c-array + static constexpr const __strides_storage_t fill_strides(mdspan_non_standard_tag, const IntegralType (&s)[extents_type::rank()>0?extents_type::rank():1]) { + return __strides_storage_t{static_cast<index_type>(s[Idxs])...}; + } + #ifdef __cpp_lib_span template<class IntegralType> - MDSPAN_INLINE_FUNCTION static constexpr const __strides_storage_t fill_strides(const std::span<IntegralType,extents_type::rank()>& s) { return __strides_storage_t{static_cast<index_type>(s[Idxs])...}; } #endif + MDSPAN_INLINE_FUNCTION + static constexpr std::array<index_type, extents_type::rank()> return_strides(const __strides_storage_t& s) { + return std::array<index_type, extents_type::rank()>{s[Idxs]...}; + } + template<size_t K> MDSPAN_INLINE_FUNCTION static constexpr size_t __return_zero() { return 0; } @@ -218,6 +236,25 @@ struct layout_stride { // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348. 
using __impl = __deduction_workaround<std::make_index_sequence<Extents::rank()>>; + MDSPAN_FUNCTION + static constexpr __strides_storage_t strides_storage(detail::with_rank<0>) { + return {}; + } + + template <std::size_t N> + MDSPAN_FUNCTION + static constexpr __strides_storage_t strides_storage(detail::with_rank<N>) { + __strides_storage_t s{}; + + extents_type e; + index_type stride = 1; + for(int r = static_cast<int>(extents_type::rank() - 1); r >= 0; r--) { + s[r] = stride; + stride *= e.extent(r); + } + + return s; + } //---------------------------------------------------------------------------- @@ -233,7 +270,21 @@ struct layout_stride { //-------------------------------------------------------------------------------- - MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default; + MDSPAN_INLINE_FUNCTION constexpr mapping() noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( +#endif + extents_type(), + __strides_storage_t(strides_storage(detail::with_rank<extents_type::rank()>{})) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + {} + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default; MDSPAN_TEMPLATE_REQUIRES( @@ -245,7 +296,6 @@ struct layout_stride { _MDSPAN_TRAIT(std::is_nothrow_constructible, typename Extents::index_type, const std::remove_const_t<IntegralTypes>&) ) ) - MDSPAN_INLINE_FUNCTION constexpr mapping( extents_type const& e, @@ -273,7 +323,6 @@ struct layout_stride { */ } -#ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES( class IntegralTypes, /* requires */ ( @@ -285,6 +334,45 @@ struct layout_stride { ) MDSPAN_INLINE_FUNCTION constexpr + mapping( + mdspan_non_standard_tag, + extents_type const& e, + // Need to avoid zero-length c-array + const IntegralTypes (&s)[extents_type::rank()>0?extents_type::rank():1] + ) noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else 
+ : __base_t(__base_t{__member_pair_t( +#endif + e, __strides_storage_t(__impl::fill_strides(mdspan_non_standard, s)) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + { + /* + * TODO: check preconditions + * - s[i] > 0 is true for all i in the range [0, rank_ ). + * - REQUIRED-SPAN-SIZE(e, s) is a representable value of type index_type ([basic.fundamental]). + * - If rank_ is greater than 0, then there exists a permutation P of the integers in the + * range [0, rank_), such that s[ pi ] >= s[ pi − 1 ] * e.extent( pi − 1 ) is true for + * all i in the range [1, rank_ ), where pi is the ith element of P. + */ + } + +#ifdef __cpp_lib_span + MDSPAN_TEMPLATE_REQUIRES( + class IntegralTypes, + /* requires */ ( + // MSVC 19.32 does not like using index_type here, requires the typename Extents::index_type + // error C2641: cannot deduce template arguments for 'MDSPAN_IMPL_STANDARD_NAMESPACE::layout_stride::mapping' + _MDSPAN_TRAIT(std::is_convertible, const std::remove_const_t<IntegralTypes>&, typename Extents::index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, typename Extents::index_type, const std::remove_const_t<IntegralTypes>&) + ) + ) + constexpr mapping( extents_type const& e, std::span<IntegralTypes, extents_type::rank()> const& s @@ -332,10 +420,10 @@ struct layout_stride { ) #endif MDSPAN_CONDITIONAL_EXPLICIT( - (!std::is_convertible<typename StridedLayoutMapping::extents_type, extents_type>::value) && - (detail::__is_mapping_of<layout_left, StridedLayoutMapping> || - detail::__is_mapping_of<layout_right, StridedLayoutMapping> || - detail::__is_mapping_of<layout_stride, StridedLayoutMapping>) + !(std::is_convertible<typename StridedLayoutMapping::extents_type, extents_type>::value && + (detail::__is_mapping_of<layout_left, StridedLayoutMapping> || + detail::__is_mapping_of<layout_right, StridedLayoutMapping> || + detail::__is_mapping_of<layout_stride, StridedLayoutMapping>)) ) // needs two () due to comma 
MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 mapping(StridedLayoutMapping const& other) noexcept // NOLINT(google-explicit-constructor) @@ -374,13 +462,14 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION constexpr std::array< index_type, extents_type::rank() > strides() const noexcept { - return __strides_storage(); + return __impl::return_strides(__strides_storage()); } MDSPAN_INLINE_FUNCTION constexpr index_type required_span_size() const noexcept { index_type span_size = 1; - for(unsigned r = 0; r < extents_type::rank(); r++) { + // using int here to avoid warning about pointless comparison to 0 + for(int r = 0; r < static_cast<int>(extents_type::rank()); r++) { // Return early if any of the extents are zero if(extents().extent(r)==0) return 0; span_size += ( static_cast<index_type>(extents().extent(r) - 1 ) * __strides_storage()[r]); @@ -393,12 +482,14 @@ struct layout_stride { class... Indices, /* requires */ ( sizeof...(Indices) == Extents::rank() && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + (detail::are_valid_indices<index_type, Indices...>()) ) ) MDSPAN_FORCE_INLINE_FUNCTION constexpr index_type operator()(Indices... idxs) const noexcept { +#if ! defined(NDEBUG) + detail::check_all_indices(this->extents(), idxs...); +#endif // ! 
NDEBUG return static_cast<index_type>(__impl::_call_op_impl(*this, static_cast<index_type>(idxs)...)); } @@ -409,18 +500,58 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } - MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { + + private: + MDSPAN_INLINE_FUNCTION + constexpr bool exhaustive_for_nonzero_span_size() const + { return required_span_size() == __get_size(extents(), std::make_index_sequence<extents_type::rank()>()); } + + MDSPAN_INLINE_FUNCTION + constexpr bool is_exhaustive_impl(detail::with_rank<0>) const + { + return true; + } + MDSPAN_INLINE_FUNCTION + constexpr bool is_exhaustive_impl(detail::with_rank<1>) const + { + if (required_span_size() != static_cast<index_type>(0)) { + return exhaustive_for_nonzero_span_size(); + } + return stride(0) == 1; + } + template <std::size_t N> + MDSPAN_INLINE_FUNCTION + constexpr bool is_exhaustive_impl(detail::with_rank<N>) const + { + if (required_span_size() != static_cast<index_type>(0)) { + return exhaustive_for_nonzero_span_size(); + } + + rank_type r_largest = 0; + for (rank_type r = 1; r < extents_type::rank(); r++) { + if (stride(r) > stride(r_largest)) { + r_largest = r; + } + } + for (rank_type r = 0; r < extents_type::rank(); r++) { + if (extents().extent(r) == 0 && r != r_largest) { + return false; + } + } + return true; + } + + public: + MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { + return is_exhaustive_impl(detail::with_rank<extents_type::rank()>{}); + } MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION - constexpr index_type stride(rank_type r) const noexcept -#if MDSPAN_HAS_CXX_20 - requires ( Extents::rank() > 0 ) -#endif - { + constexpr index_type stride(rank_type r) const noexcept { return __strides_storage()[r]; } @@ 
-443,12 +574,9 @@ struct layout_stride { #endif MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const mapping& x, const StridedLayoutMapping& y) noexcept { - bool strides_match = true; - for(rank_type r = 0; r < extents_type::rank(); r++) - strides_match = strides_match && (x.stride(r) == y.stride(r)); return (x.extents() == y.extents()) && - (__impl::__OFFSET(y)== static_cast<typename StridedLayoutMapping::index_type>(0)) && - strides_match; + (__impl::__OFFSET(y) == static_cast<typename StridedLayoutMapping::index_type>(0)) && + detail::rankwise_equal(detail::with_rank<extents_type::rank()>{}, x, y, detail::stride); } // This one is not technically part of the proposal. Just here to make implementation a bit more optimal hopefully @@ -474,7 +602,7 @@ struct layout_stride { ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(const mapping& x, const StridedLayoutMapping& y) noexcept { - return not (x == y); + return !(x == y); } MDSPAN_TEMPLATE_REQUIRES( @@ -489,7 +617,51 @@ struct layout_stride { } #endif + // [mdspan.submdspan.mapping], submdspan mapping specialization + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... 
slices) { + return src.submdspan_mapping_impl(slices...); + } }; }; +namespace detail { + +template <class Layout, class Extents, class Mapping> +MDSPAN_INLINE_FUNCTION +constexpr void validate_strides(with_rank<0>, Layout, const Extents&, const Mapping&) +{} + +template <std::size_t N, class Layout, class Extents, class Mapping> +MDSPAN_INLINE_FUNCTION +constexpr void validate_strides(with_rank<N>, Layout, const Extents& ext, const Mapping& other) +{ + static_assert(std::is_same<typename Mapping::layout_type, layout_stride>::value && + (std::is_same<Layout, layout_left>::value || + std::is_same<Layout, layout_right>::value) + , "This function is only intended to validate construction of " + "a layout_left or layout_right mapping from a layout_stride mapping."); + + constexpr auto is_left = std::is_same<Layout, layout_left>::value; + + typename Extents::index_type expected_stride = 1; + + for (std::size_t r = 0; r < N; r++) { + const std::size_t s = is_left ? r : N - 1 - r; + + MDSPAN_IMPL_PRECONDITION(common_integral_compare(expected_stride, other.stride(s)) + && "invalid strides for layout_{left,right}"); + + expected_stride *= ext.extent(s); + } +} + +} // namespace detail } // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp index 3eeb39755c8aed3690892ac4cb2b2cb9c9935c96..b60c4261779e8cfe0a8f5bf875714284977a0533 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/macros.hpp @@ -18,7 +18,12 @@ #include "config.hpp" +#include <cstdio> +#include <cstdlib> #include <type_traits> // std::is_void +#if defined(_MDSPAN_HAS_CUDA) || defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_SYCL) +#include "assert.h" +#endif #ifndef _MDSPAN_HOST_DEVICE # if defined(_MDSPAN_HAS_CUDA) || defined(_MDSPAN_HAS_HIP) @@ -101,6 
+106,69 @@ #define MDSPAN_IMPL_STANDARD_NAMESPACE_STRING MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_STANDARD_NAMESPACE) #define MDSPAN_IMPL_PROPOSED_NAMESPACE_STRING MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_STANDARD_NAMESPACE) "::" MDSPAN_PP_STRINGIFY(MDSPAN_IMPL_PROPOSED_NAMESPACE) +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace detail { + +#if defined(_MDSPAN_HAS_CUDA) || defined(_MDSPAN_HAS_HIP) +MDSPAN_FUNCTION inline void default_precondition_violation_handler(const char* cond, const char* file, unsigned line) +{ + printf("%s:%u: precondition failure: `%s`\n", file, line, cond); + assert(0); +} +#elif defined(_MDSPAN_HAS_SYCL) +MDSPAN_FUNCTION inline void default_precondition_violation_handler(const char* cond, const char* file, unsigned line) +{ + sycl::ext::oneapi::experimental::printf("%s:%u: precondition failure: `%s`\n", file, line, cond); + assert(0); +} +#else +MDSPAN_FUNCTION inline void default_precondition_violation_handler(const char* cond, const char* file, unsigned line) +{ + std::fprintf(stderr, "%s:%u: precondition failure: `%s`\n", file, line, cond); + std::abort(); +} +#endif + +} // namespace detail +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE + +#ifndef MDSPAN_IMPL_PRECONDITION_VIOLATION_HANDLER +#define MDSPAN_IMPL_PRECONDITION_VIOLATION_HANDLER(cond, file, line) \ + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::default_precondition_violation_handler(cond, file, line) +#endif + +#ifndef MDSPAN_IMPL_CHECK_PRECONDITION + #ifndef NDEBUG + #define MDSPAN_IMPL_CHECK_PRECONDITION 0 + #else + #define MDSPAN_IMPL_CHECK_PRECONDITION 1 + #endif +#endif + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace detail { + +template <bool check = MDSPAN_IMPL_CHECK_PRECONDITION> +MDSPAN_FUNCTION constexpr void precondition(const char* cond, const char* file, unsigned line) +{ + if (!check) { return; } + // in case the macro doesn't use the arguments for custom macros + (void) cond; + (void) file; + (void) line; + MDSPAN_IMPL_PRECONDITION_VIOLATION_HANDLER(cond, file, 
line); +} + +} // namespace detail +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE + +#define MDSPAN_IMPL_PRECONDITION(...) \ + do { \ + if (!(__VA_ARGS__)) { \ + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::precondition(#__VA_ARGS__, __FILE__, __LINE__); \ + } \ + } while (0) + // </editor-fold> end Preprocessor helpers }}}1 //============================================================================== @@ -574,7 +642,7 @@ __fold_left_assign_impl(Args&&... args) { template <class... Args> -constexpr __mdspan_enable_fold_comma __fold_comma_impl(Args&&... args) noexcept { return { }; } +constexpr __mdspan_enable_fold_comma __fold_comma_impl(Args&&...) noexcept { return { }; } template <bool... Bs> struct __bools; diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp index 6febe30021501dbbb2d46656a2877b96c8898cd1..23114aa55068317216e38acb444687a7ef91c98a 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp @@ -34,6 +34,8 @@ class mdspan private: static_assert(detail::__is_extents_v<Extents>, MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::mdspan's Extents template parameter must be a specialization of " MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::extents."); + static_assert(std::is_same<ElementType, typename AccessorPolicy::element_type>::value, + MDSPAN_IMPL_STANDARD_NAMESPACE_STRING "::mdspan's ElementType template parameter must be the same as its AccessorPolicy::element_type."); // Workaround for non-deducibility of the index sequence template parameter if it's given at the top level template <class> @@ -55,6 +57,13 @@ private: ReferenceType __callop(mdspan const& __self, const std::array<SizeType, N>& indices) noexcept { return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); } +#ifdef __cpp_lib_span + template <class 
ReferenceType, class SizeType, size_t N> + MDSPAN_FORCE_INLINE_FUNCTION static constexpr + ReferenceType __callop(mdspan const& __self, const std::span<SizeType, N>& indices) noexcept { + return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); + } +#endif }; public: @@ -109,9 +118,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) && ((sizeof...(SizeTypes) == rank()) || (sizeof...(SizeTypes) == rank_dynamic())) && + (detail::are_valid_indices<index_type, SizeTypes...>()) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) ) @@ -125,8 +133,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -142,8 +150,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -160,7 +168,7 @@ 
public: (MDSPAN_INLINE_FUNCTION constexpr), mdspan, (data_handle_type p, const extents_type& exts), , /* requires */ (_MDSPAN_TRAIT(std::is_default_constructible, accessor_type) && - _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type)) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const extents_type&)) ) : __members(std::move(p), __map_acc_pair_t(mapping_type(exts), accessor_type())) { } @@ -179,10 +187,14 @@ public: MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherAccessor, /* requires */ ( - _MDSPAN_TRAIT(std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping<OtherExtents>) && - _MDSPAN_TRAIT(std::is_constructible, accessor_type, OtherAccessor) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const typename OtherLayoutPolicy::template mapping<OtherExtents>&) && + _MDSPAN_TRAIT(std::is_constructible, accessor_type, const OtherAccessor&) ) ) + MDSPAN_CONDITIONAL_EXPLICIT( + !_MDSPAN_TRAIT(std::is_convertible, const typename OtherLayoutPolicy::template mapping<OtherExtents>&, mapping_type) || + !_MDSPAN_TRAIT(std::is_convertible, const OtherAccessor&, accessor_type) + ) MDSPAN_INLINE_FUNCTION constexpr mdspan(const mdspan<OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>& other) : __members(other.__ptr_ref(), __map_acc_pair_t(other.__mapping_ref(), other.__accessor_ref())) @@ -226,8 +238,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -240,8 +252,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - 
_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -271,9 +283,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + extents_type::rank() == sizeof...(SizeTypes) && + (detail::are_valid_indices<index_type, SizeTypes...>()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -285,8 +296,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -299,8 +310,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -311,8 +322,8 @@ public: #endif // __cpp_lib_span #endif // MDSPAN_USE_PAREN_OPERATOR - MDSPAN_INLINE_FUNCTION constexpr size_t size() const noexcept { - return __impl::__size(*this); + MDSPAN_INLINE_FUNCTION constexpr size_type size() const noexcept { + return static_cast<size_type>(__impl::__size(*this)); }; MDSPAN_INLINE_FUNCTION constexpr bool empty() const noexcept { @@ -346,13 +357,13 @@ public: 
//-------------------------------------------------------------------------------- // [mdspan.basic.obs], mdspan observers of the mapping - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return mapping_type::is_always_unique(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return mapping_type::is_always_exhaustive(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return mapping_type::is_always_strided(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() { return mapping_type::is_always_unique(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() { return mapping_type::is_always_exhaustive(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() { return mapping_type::is_always_strided(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return __mapping_ref().is_unique(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return __mapping_ref().is_exhaustive(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return __mapping_ref().is_strided(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const { return __mapping_ref().is_unique(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const { return __mapping_ref().is_exhaustive(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const { return __mapping_ref().is_strided(); }; MDSPAN_INLINE_FUNCTION constexpr index_type stride(size_t r) const { return __mapping_ref().stride(r); }; private: @@ -374,7 +385,7 @@ private: #if defined(_MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) MDSPAN_TEMPLATE_REQUIRES( class ElementType, class... SizeTypes, - /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_integral, SizeTypes) /* && ... */) && + /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, size_t) /* && ... 
*/) && (sizeof...(SizeTypes) > 0) ) MDSPAN_DEDUCTION_GUIDE explicit mdspan(ElementType*, SizeTypes...) diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/utility.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/utility.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f7f39d6024e829cb46b25efd04522f7327d63e8c --- /dev/null +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/utility.hpp @@ -0,0 +1,172 @@ +#pragma once + +#include <cstddef> +#include <type_traits> +#include <array> +#include <utility> + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace detail { + +// type alias used for rank-based tag dispatch +// +// this is used to enable alternatives to constexpr if when building for C++14 +// +template <std::size_t N> +using with_rank = std::integral_constant<std::size_t, N>; + +template <class I1, class I2> +MDSPAN_INLINE_FUNCTION +constexpr bool common_integral_compare(I1 x, I2 y) +{ + static_assert(std::is_integral<I1>::value && + std::is_integral<I2>::value, ""); + + using I = std::common_type_t<I1, I2>; + return static_cast<I>(x) == static_cast<I>(y); +} + +template <class T1, class T2, class F> +MDSPAN_INLINE_FUNCTION +constexpr bool rankwise_equal(with_rank<0>, const T1&, const T2&, F) +{ + return true; +} + +template <std::size_t N, class T1, class T2, class F> +MDSPAN_INLINE_FUNCTION +constexpr bool rankwise_equal(with_rank<N>, const T1& x, const T2& y, F func) +{ + bool match = true; + + for (std::size_t r = 0; r < N; r++) { + match = match && common_integral_compare(func(x, r), func(y, r)); + } + + return match; +} + +constexpr struct +{ + template <class T, class I> + MDSPAN_INLINE_FUNCTION + constexpr auto operator()(const T& x, I i) const + { + return x.extent(i); + } +} extent; + +constexpr struct +{ + template <class T, class I> + MDSPAN_INLINE_FUNCTION + constexpr auto operator()(const T& x, I i) const + { + return x.stride(i); + } +} stride; + +// same as 
std::integral_constant but with __host__ __device__ annotations on +// the implicit conversion function and the call operator +template <class T, T v> +struct integral_constant { + using value_type = T; + using type = integral_constant<T, v>; + + static constexpr T value = v; + + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr integral_constant() = default; + + // These interop functions work, because other than the value_type operator + // everything of std::integral_constant works on device (defaulted functions) + MDSPAN_FUNCTION + constexpr integral_constant(std::integral_constant<T,v>) {}; + + MDSPAN_FUNCTION constexpr operator std::integral_constant<T,v>() const noexcept { + return std::integral_constant<T,v>{}; + } + + MDSPAN_FUNCTION constexpr operator value_type() const noexcept { + return value; + } + + MDSPAN_FUNCTION constexpr value_type operator()() const noexcept { + return value; + } +}; + +// The tuple implementation only comes in play when using capabilities +// such as submdspan which require C++17 anyway +#if MDSPAN_HAS_CXX_17 +template<class T, size_t Idx> +struct tuple_member { + using type = T; + static constexpr size_t idx = Idx; + T val; + MDSPAN_FUNCTION constexpr T& get() { return val; } + MDSPAN_FUNCTION constexpr const T& get() const { return val; } +}; + +// A helper class which will be used via a fold expression to +// select the type with the correct Idx in a pack of tuple_member +template<size_t SearchIdx, size_t Idx, class T> +struct tuple_idx_matcher { + using type = tuple_member<T, Idx>; + template<class Other> + MDSPAN_FUNCTION + constexpr auto operator | (Other v) const { + if constexpr (Idx == SearchIdx) { return *this; } + else { return v; } + } +}; + +template<class IdxSeq, class ... Elements> +struct tuple_impl; + +template<size_t ... Idx, class ... Elements> +struct tuple_impl<std::index_sequence<Idx...>, Elements...>: public tuple_member<Elements, Idx> ... { + + MDSPAN_FUNCTION + constexpr tuple_impl(Elements ... 
vals):tuple_member<Elements, Idx>{vals}... {} + + template<size_t N> + MDSPAN_FUNCTION + constexpr auto& get() { + using base_t = decltype((tuple_idx_matcher<N, Idx, Elements>() | ...) ); + return base_t::type::get(); + } + template<size_t N> + MDSPAN_FUNCTION + constexpr const auto& get() const { + using base_t = decltype((tuple_idx_matcher<N, Idx, Elements>() | ...) ); + return base_t::type::get(); + } +}; + +// A simple tuple-like class for representing slices internally and is compatible with device code +// This doesn't support type access since we don't need it +// This is not meant as an external API +template<class ... Elements> +struct tuple: public tuple_impl<decltype(std::make_index_sequence<sizeof...(Elements)>()), Elements...> { + MDSPAN_FUNCTION + constexpr tuple(Elements ... vals):tuple_impl<decltype(std::make_index_sequence<sizeof...(Elements)>()), Elements ...>(vals ...) {} +}; + +template<size_t Idx, class ... Args> +MDSPAN_FUNCTION +constexpr auto& get(tuple<Args...>& vals) { return vals.template get<Idx>(); } + +template<size_t Idx, class ... Args> +MDSPAN_FUNCTION +constexpr const auto& get(const tuple<Args...>& vals) { return vals.template get<Idx>(); } + +template<class ... Elements> +tuple(Elements ...) -> tuple<Elements...>; +#endif +} // namespace detail + +constexpr struct mdspan_non_standard_tag { +} mdspan_non_standard; + +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp index 3950273a83dc114809a4f44151150ed4b3430fbb..bdc5925f715190b2d7163ec75e0c5956df706df9 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp @@ -103,8 +103,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... 
SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices<index_type, SizeTypes...>()) && + _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) && (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) || container_is_array<container_type>::value) && @@ -133,61 +133,29 @@ public: ) : map_(m), ctr_(container_is_array<container_type>::construct(map_)) { } - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(const container_type& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(ctr) - { assert(ctr.size() >= static_cast<size_t>(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (const container_type& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, const container_type& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(ctr) { assert(ctr.size() >= static_cast<size_t>(map_.required_span_size())); } - constexpr mdarray(const container_type& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, const container_type& ctr) : map_(m), ctr_(ctr) { assert(ctr.size() >= static_cast<size_t>(map_.required_span_size())); } - - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... 
SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(container_type&& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(std::move(ctr)) - { assert(ctr_.size() >= static_cast<size_t>(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (container_type&& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, container_type&& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast<size_t>(map_.required_span_size())); } - constexpr mdarray(container_type&& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, container_type&& ctr) : map_(m), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast<size_t>(map_.required_span_size())); } - MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherContainer, /* requires */ ( @@ -229,7 +197,7 @@ public: _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, const container_type& ctr, const Alloc& a) : map_(exts), ctr_(ctr, a) { assert(ctr_.size() >= static_cast<size_t>(map_.required_span_size())); } @@ -238,7 +206,7 @@ public: /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, const container_type& ctr, const Alloc& a) : 
map_(map), ctr_(ctr, a) { assert(ctr_.size() >= static_cast<size_t>(map_.required_span_size())); } @@ -248,7 +216,7 @@ public: _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, container_type&& ctr, const Alloc& a) : map_(exts), ctr_(std::move(ctr), a) { assert(ctr_.size() >= static_cast<size_t>(map_.required_span_size())); } @@ -257,7 +225,7 @@ public: /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, container_type&& ctr, const Alloc& a) : map_(map), ctr_(std::move(ctr), a) { assert(ctr_.size() >= map_.required_span_size()); } @@ -344,8 +312,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices<index_type, SizeTypes...>()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -356,8 +324,8 @@ public: MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... 
*/) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices<index_type, SizeTypes...>()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -433,8 +401,9 @@ public: class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, mdspan_type, - mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType>) + _MDSPAN_TRAIT(std::is_assignable, + mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType>, + mdspan_type) ) ) constexpr operator mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType> () { @@ -445,8 +414,9 @@ public: class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, const_mdspan_type, - mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType>) + _MDSPAN_TRAIT(std::is_assignable, + mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType>, + const_mdspan_type) ) ) constexpr operator mdspan<OtherElementType, OtherExtents, OtherLayoutType, OtherAccessorType> () const { diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2389_bits/dims.hpp similarity index 59% rename from packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp rename to packages/kokkos/tpls/mdspan/include/experimental/__p2389_bits/dims.hpp index 1328c93135243c7906643157af5d66b8dc4d1cbb..00045215c489bfaa04bd0d7967a85b49559fb714 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2389_bits/dims.hpp @@ -14,11 +14,15 @@ // //@HEADER -#ifndef KOKKOS_DECLARE_HBWSPACE_HPP -#define KOKKOS_DECLARE_HBWSPACE_HPP +#pragma once -#ifdef KOKKOS_ENABLE_HBWSPACE -#include <Kokkos_HBWSpace.hpp> -#endif +// backward 
compatibility import into experimental +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { -#endif +template< ::std::size_t Rank, class IndexType = std::size_t> +using dims = + :: MDSPAN_IMPL_STANDARD_NAMESPACE :: dextents<IndexType, Rank>; + +} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE +} // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp index 58f38620ba1a484e6cf1ea547232d02febd08bbd..89ba8202fb16a090bfa9352a4dcf7041cf53c90f 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp @@ -20,7 +20,6 @@ #include <type_traits> namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace { template<class T> @@ -29,6 +28,7 @@ namespace { template<class T, T val> struct __mdspan_is_integral_constant<std::integral_constant<T,val>>: std::true_type {}; } + // Slice Specifier allowing for strides and compile time extent template <class OffsetType, class ExtentType, class StrideType> struct strided_slice { @@ -36,14 +36,13 @@ struct strided_slice { using extent_type = ExtentType; using stride_type = StrideType; - OffsetType offset; - ExtentType extent; - StrideType stride; + _MDSPAN_NO_UNIQUE_ADDRESS OffsetType offset{}; + _MDSPAN_NO_UNIQUE_ADDRESS ExtentType extent{}; + _MDSPAN_NO_UNIQUE_ADDRESS StrideType stride{}; static_assert(std::is_integral_v<OffsetType> || __mdspan_is_integral_constant<OffsetType>::value); static_assert(std::is_integral_v<ExtentType> || __mdspan_is_integral_constant<ExtentType>::value); static_assert(std::is_integral_v<StrideType> || __mdspan_is_integral_constant<StrideType>::value); }; -} // MDSPAN_IMPL_PROPOSED_NAMESPACE } // MDSPAN_IMPL_STANDARD_NAMESPACE diff --git 
a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp index b9672b7f9ac357834347b9df2ab25c63c5862acf..abddd0b59df170f2b16f7e5d301e45378a42bdee 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp @@ -20,23 +20,21 @@ #include "submdspan_mapping.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { template <class ElementType, class Extents, class LayoutPolicy, class AccessorPolicy, class... SliceSpecifiers> MDSPAN_INLINE_FUNCTION constexpr auto submdspan(const mdspan<ElementType, Extents, LayoutPolicy, AccessorPolicy> &src, SliceSpecifiers... slices) { - const auto sub_mapping_offset = submdspan_mapping(src.mapping(), slices...); + const auto sub_submdspan_mapping_result = submdspan_mapping(src.mapping(), slices...); // NVCC has a problem with the deduction so lets figure out the type - using sub_mapping_t = std::remove_cv_t<decltype(sub_mapping_offset.mapping)>; + using sub_mapping_t = std::remove_cv_t<decltype(sub_submdspan_mapping_result.mapping)>; using sub_extents_t = typename sub_mapping_t::extents_type; using sub_layout_t = typename sub_mapping_t::layout_type; using sub_accessor_t = typename AccessorPolicy::offset_policy; return mdspan<ElementType, sub_extents_t, sub_layout_t, sub_accessor_t>( - src.accessor().offset(src.data_handle(), sub_mapping_offset.offset), - sub_mapping_offset.mapping, + src.accessor().offset(src.data_handle(), sub_submdspan_mapping_result.offset), + sub_submdspan_mapping_result.mapping, sub_accessor_t(src.accessor())); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp 
b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp index f56ce023f1652d7f01e982448faafb8d5a12c542..4fe5dc6e29a00add050902edca1fa7ac1ee32f49 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp @@ -16,11 +16,12 @@ #pragma once -#include <tuple> +#include <complex> #include "strided_slice.hpp" +#include "../__p0009_bits/utility.hpp" + namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace detail { // Mapping from submapping ranks to srcmapping ranks @@ -53,6 +54,37 @@ template <class OffsetType, class ExtentType, class StrideType> struct is_strided_slice< strided_slice<OffsetType, ExtentType, StrideType>> : std::true_type {}; +// Helper for identifying valid pair like things +template <class T, class IndexType> struct index_pair_like : std::false_type {}; + +template <class IdxT1, class IdxT2, class IndexType> +struct index_pair_like<std::pair<IdxT1, IdxT2>, IndexType> { + static constexpr bool value = std::is_convertible_v<IdxT1, IndexType> && + std::is_convertible_v<IdxT2, IndexType>; +}; + +template <class IdxT1, class IdxT2, class IndexType> +struct index_pair_like<std::tuple<IdxT1, IdxT2>, IndexType> { + static constexpr bool value = std::is_convertible_v<IdxT1, IndexType> && + std::is_convertible_v<IdxT2, IndexType>; +}; + +template <class IdxT1, class IdxT2, class IndexType> +struct index_pair_like<tuple<IdxT1, IdxT2>, IndexType> { + static constexpr bool value = std::is_convertible_v<IdxT1, IndexType> && + std::is_convertible_v<IdxT2, IndexType>; +}; + +template <class IdxT, class IndexType> +struct index_pair_like<std::complex<IdxT>, IndexType> { + static constexpr bool value = std::is_convertible_v<IdxT, IndexType>; +}; + +template <class IdxT, class IndexType> +struct index_pair_like<std::array<IdxT, 2>, IndexType> { + static constexpr bool value = 
std::is_convertible_v<IdxT, IndexType>; +}; + // first_of(slice): getting begin of slice specifier range MDSPAN_TEMPLATE_REQUIRES( class Integral, @@ -63,19 +95,48 @@ constexpr Integral first_of(const Integral &i) { return i; } +template<class Integral, Integral v> MDSPAN_INLINE_FUNCTION -constexpr std::integral_constant<size_t, 0> +constexpr Integral first_of(const std::integral_constant<Integral, v>&) { + return integral_constant<Integral, v>(); +} + +MDSPAN_INLINE_FUNCTION +constexpr integral_constant<size_t, 0> first_of(const ::MDSPAN_IMPL_STANDARD_NAMESPACE::full_extent_t &) { - return std::integral_constant<size_t, 0>(); + return integral_constant<size_t, 0>(); } MDSPAN_TEMPLATE_REQUIRES( class Slice, - /* requires */(std::is_convertible_v<Slice, std::tuple<size_t, size_t>>) + /* requires */(index_pair_like<Slice, size_t>::value) ) MDSPAN_INLINE_FUNCTION constexpr auto first_of(const Slice &i) { - return std::get<0>(i); + return get<0>(i); +} + +MDSPAN_TEMPLATE_REQUIRES( + class IdxT1, class IdxT2, + /* requires */ (index_pair_like<std::tuple<IdxT1, IdxT2>, size_t>::value) + ) +constexpr auto first_of(const std::tuple<IdxT1, IdxT2>& i) { + return get<0>(i); +} + +MDSPAN_TEMPLATE_REQUIRES( + class IdxT1, class IdxT2, + /* requires */ (index_pair_like<std::pair<IdxT1, IdxT2>, size_t>::value) + ) +MDSPAN_INLINE_FUNCTION +constexpr auto first_of(const std::pair<IdxT1, IdxT2>& i) { + return i.first; +} + +template<class T> +MDSPAN_INLINE_FUNCTION +constexpr auto first_of(const std::complex<T> &i) { + return i.real(); } template <class OffsetType, class ExtentType, class StrideType> @@ -101,12 +162,35 @@ constexpr Integral MDSPAN_TEMPLATE_REQUIRES( size_t k, class Extents, class Slice, - /* requires */(std::is_convertible_v<Slice, std::tuple<size_t, size_t>>) + /* requires */(index_pair_like<Slice, size_t>::value) ) MDSPAN_INLINE_FUNCTION constexpr auto last_of(std::integral_constant<size_t, k>, const Extents &, const Slice &i) { - return std::get<1>(i); + return 
get<1>(i); +} + +MDSPAN_TEMPLATE_REQUIRES( + size_t k, class Extents, class IdxT1, class IdxT2, + /* requires */ (index_pair_like<std::tuple<IdxT1, IdxT2>, size_t>::value) + ) +constexpr auto last_of(std::integral_constant<size_t, k>, const Extents &, const std::tuple<IdxT1, IdxT2>& i) { + return get<1>(i); +} + +MDSPAN_TEMPLATE_REQUIRES( + size_t k, class Extents, class IdxT1, class IdxT2, + /* requires */ (index_pair_like<std::pair<IdxT1, IdxT2>, size_t>::value) + ) +MDSPAN_INLINE_FUNCTION +constexpr auto last_of(std::integral_constant<size_t, k>, const Extents &, const std::pair<IdxT1, IdxT2>& i) { + return i.second; +} + +template<size_t k, class Extents, class T> +MDSPAN_INLINE_FUNCTION +constexpr auto last_of(std::integral_constant<size_t, k>, const Extents &, const std::complex<T> &i) { + return i.imag(); } // Suppress spurious warning with NVCC about no return statement. @@ -135,7 +219,7 @@ constexpr auto last_of(std::integral_constant<size_t, k>, const Extents &ext, if constexpr (Extents::static_extent(k) == dynamic_extent) { return ext.extent(k); } else { - return std::integral_constant<size_t, Extents::static_extent(k)>(); + return integral_constant<size_t, Extents::static_extent(k)>(); } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) // Even with CUDA_ARCH protection this thing warns about calling host function @@ -167,7 +251,7 @@ last_of(std::integral_constant<size_t, k>, const Extents &, template <class T> MDSPAN_INLINE_FUNCTION constexpr auto stride_of(const T &) { - return std::integral_constant<size_t, 1>(); + return integral_constant<size_t, 1>(); } template <class OffsetType, class ExtentType, class StrideType> @@ -190,7 +274,7 @@ constexpr auto divide(const std::integral_constant<T0, v0> &, const std::integral_constant<T1, v1> &) { // cutting short division by zero // this is used for strided_slice with zero extent/stride - return std::integral_constant<IndexT, v0 == 0 ? 
0 : v0 / v1>(); + return integral_constant<IndexT, v0 == 0 ? 0 : v0 / v1>(); } // multiply which can deal with integral constant preservation @@ -204,7 +288,7 @@ template <class IndexT, class T0, T0 v0, class T1, T1 v1> MDSPAN_INLINE_FUNCTION constexpr auto multiply(const std::integral_constant<T0, v0> &, const std::integral_constant<T1, v1> &) { - return std::integral_constant<IndexT, v0 * v1>(); + return integral_constant<IndexT, v0 * v1>(); } // compute new static extent from range, preserving static knowledge @@ -218,6 +302,12 @@ struct StaticExtentFromRange<std::integral_constant<Integral0, val0>, constexpr static size_t value = val1 - val0; }; +template <class Integral0, Integral0 val0, class Integral1, Integral1 val1> +struct StaticExtentFromRange<integral_constant<Integral0, val0>, + integral_constant<Integral1, val1>> { + constexpr static size_t value = val1 - val0; +}; + // compute new static extent from strided_slice, preserving static // knowledge template <class Arg0, class Arg1> struct StaticExtentFromStridedRange { @@ -230,6 +320,12 @@ struct StaticExtentFromStridedRange<std::integral_constant<Integral0, val0>, constexpr static size_t value = val0 > 0 ? 1 + (val0 - 1) / val1 : 0; }; +template <class Integral0, Integral0 val0, class Integral1, Integral1 val1> +struct StaticExtentFromStridedRange<integral_constant<Integral0, val0>, + integral_constant<Integral1, val1>> { + constexpr static size_t value = val0 > 0 ? 1 + (val0 - 1) / val1 : 0; +}; + // creates new extents through recursive calls to next_extent member function // next_extent has different overloads for different types of stride specifiers template <size_t K, class Extents, size_t... 
NewExtents> @@ -319,5 +415,4 @@ constexpr auto submdspan_extents(const extents<IndexType, Extents...> &src_exts, return detail::extents_constructor<ext_t::rank(), ext_t>::next_extent( src_exts, slices...); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp index 48778d57e75fcc1134504fb2b9902599357bd536..46ccbaadebe027b0f7453f7dd398204da0a7c9e5 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp @@ -18,37 +18,107 @@ #include <array> #include <type_traits> -#include <tuple> #include <utility> // index_sequence +#include "../__p0009_bits/utility.hpp" + +// Suppress spurious warning with NVCC about no return statement. +// This is a known issue in NVCC and NVC++ +// Depending on the CUDA and GCC version we need both the builtin +// and the diagnostic push. I tried really hard to find something shorter +// but no luck ... 
+#if defined __NVCC__ +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress = implicit_return_from_non_void_function +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic push +#pragma diag_suppress implicit_return_from_non_void_function +#endif +#endif +#elif defined __NVCOMPILER +#pragma diagnostic push +#pragma diag_suppress = implicit_return_from_non_void_function +#endif namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { //****************************************** // Return type of submdspan_mapping overloads //****************************************** -template <class Mapping> struct mapping_offset { - Mapping mapping; +template <class LayoutMapping> struct submdspan_mapping_result { + _MDSPAN_NO_UNIQUE_ADDRESS LayoutMapping mapping{}; size_t offset; }; -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE namespace detail { -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::first_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::stride_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::inv_map_rank; + +// We use const Slice& and not Slice&& because the various +// submdspan_mapping_impl overloads use their slices arguments +// multiple times. This makes perfect forwarding not useful, but we +// still don't want to pass those (possibly of size 64 x 3 bits) +// objects by value. +template <class IndexType, class Slice> +MDSPAN_INLINE_FUNCTION constexpr bool +one_slice_out_of_bounds(const IndexType &ext, const Slice &slice) { + using common_t = + std::common_type_t<decltype(detail::first_of(slice)), IndexType>; + return static_cast<common_t>(detail::first_of(slice)) == + static_cast<common_t>(ext); +} + +template <size_t... RankIndices, class IndexType, size_t... Exts, + class... Slices> +MDSPAN_INLINE_FUNCTION constexpr bool +any_slice_out_of_bounds_helper(std::index_sequence<RankIndices...>, + const extents<IndexType, Exts...> &exts, + const Slices &... 
slices) { + return _MDSPAN_FOLD_OR( + (one_slice_out_of_bounds(exts.extent(RankIndices), slices))); +} + +template <class IndexType, size_t... Exts, class... Slices> +MDSPAN_INLINE_FUNCTION constexpr bool +any_slice_out_of_bounds(const extents<IndexType, Exts...> &exts, + const Slices &... slices) { + return any_slice_out_of_bounds_helper( + std::make_index_sequence<sizeof...(Slices)>(), exts, slices...); +} // constructs sub strides +template<class T, size_t N> +struct sub_strides +{ + T values[N > 0 ? N : 1]; +}; + template <class SrcMapping, class... slice_strides, size_t... InvMapIdxs> -MDSPAN_INLINE_FUNCTION -constexpr auto -construct_sub_strides(const SrcMapping &src_mapping, - std::index_sequence<InvMapIdxs...>, - const std::tuple<slice_strides...> &slices_stride_factor) { +MDSPAN_INLINE_FUNCTION constexpr auto construct_sub_strides( + const SrcMapping &src_mapping, std::index_sequence<InvMapIdxs...>, + const MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<slice_strides...> &slices_stride_factor) { using index_type = typename SrcMapping::index_type; - return std::array<typename SrcMapping::index_type, sizeof...(InvMapIdxs)>{ + return sub_strides<typename SrcMapping::index_type, sizeof...(InvMapIdxs)>{{ (static_cast<index_type>(src_mapping.stride(InvMapIdxs)) * - static_cast<index_type>(std::get<InvMapIdxs>(slices_stride_factor)))...}; + static_cast<index_type>(get<InvMapIdxs>(slices_stride_factor)))...}}; } + +template<class SliceSpecifier, class IndexType> +struct is_range_slice { + constexpr static bool value = + std::is_same_v<SliceSpecifier, full_extent_t> || + index_pair_like<SliceSpecifier, IndexType>::value; +}; + +template<class SliceSpecifier, class IndexType> +constexpr bool is_range_slice_v = is_range_slice<SliceSpecifier, IndexType>::value; + +template<class SliceSpecifier, class IndexType> +struct is_index_slice { + constexpr static bool value = std::is_convertible_v<SliceSpecifier, IndexType>; +}; + +template<class SliceSpecifier, class 
IndexType> +constexpr bool is_index_slice_v = is_index_slice<SliceSpecifier, IndexType>::value; + } // namespace detail //********************************** @@ -57,106 +127,228 @@ construct_sub_strides(const SrcMapping &src_mapping, namespace detail { // Figure out whether to preserve layout_left -template <class IndexSequence, size_t SubRank, class... SliceSpecifiers> -struct preserve_layout_left_mapping; +template <class IndexType, size_t SubRank, class IndexSequence, + class... SliceSpecifiers> +struct deduce_layout_left_submapping; -template <class... SliceSpecifiers, size_t... Idx, size_t SubRank> -struct preserve_layout_left_mapping<std::index_sequence<Idx...>, SubRank, - SliceSpecifiers...> { - constexpr static bool value = - // Preserve layout for rank 0 - (SubRank == 0) || - ( - // Slice specifiers up to subrank need to be full_extent_t - except - // for the last one which could also be tuple but not a strided index - // range slice specifiers after subrank are integrals - ((Idx > SubRank - 1) || // these are only integral slice specifiers - (std::is_same_v<SliceSpecifiers, full_extent_t>) || - ((Idx == SubRank - 1) && - std::is_convertible_v<SliceSpecifiers, std::tuple<size_t, size_t>>)) && - ...); +template <class IndexType, size_t SubRank, size_t... Idx, + class... SliceSpecifiers> +struct deduce_layout_left_submapping< + IndexType, SubRank, std::index_sequence<Idx...>, SliceSpecifiers...> { + + using count_range = index_sequence_scan_impl< + 0u, (is_index_slice_v<SliceSpecifiers, IndexType> ? 0u : 1u)...>; + + constexpr static int gap_len = + (((Idx > 0 && count_range::get(Idx) == 1 && + is_index_slice_v<SliceSpecifiers, IndexType>) + ? 1 + : 0) + + ... 
+ 0); + + MDSPAN_INLINE_FUNCTION + constexpr static bool layout_left_value() { + // Use layout_left for rank 0 + if constexpr (SubRank == 0) { + return true; + // Use layout_left for rank 1 result if leftmost slice specifier is range like + } else if constexpr (SubRank == 1) { + return ((Idx > 0 || is_range_slice_v<SliceSpecifiers, IndexType>)&&...); + } else { + // Preserve if leftmost SubRank-1 slices are full_extent_t and + // the slice at idx Subrank - 1 is a range and + // for idx > SubRank the slice is an index + return ((((Idx < SubRank - 1) && std::is_same_v<SliceSpecifiers, full_extent_t>) || + ((Idx == SubRank - 1) && is_range_slice_v<SliceSpecifiers, IndexType>) || + ((Idx > SubRank - 1) && is_index_slice_v<SliceSpecifiers, IndexType>)) && ...); + } +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) + __builtin_unreachable(); +#endif + } + + MDSPAN_INLINE_FUNCTION + constexpr static bool layout_left_padded_value() { + // Technically could also keep layout_left_padded for SubRank==0 + // and SubRank==1 with leftmost slice specifier being a contiguous range + // but we intercept these cases separately + + // In all other cases: + // leftmost slice must be range + // then there can be a gap with index slices + // then SubRank - 2 full_extent slices + // then another range slice + // then more index slices + // e.g. R I I I F F F R I I for obtaining a rank-5 from a rank-10 + return ((((Idx == 0) && is_range_slice_v<SliceSpecifiers, IndexType>) || + ((Idx > 0 && Idx <= gap_len) && is_index_slice_v<SliceSpecifiers, IndexType>) || + ((Idx > gap_len && Idx < gap_len + SubRank - 1) && std::is_same_v<SliceSpecifiers, full_extent_t>) || + ((Idx == gap_len + SubRank - 1) && is_range_slice_v<SliceSpecifiers, IndexType>) || + ((Idx > gap_len + SubRank - 1) && is_index_slice_v<SliceSpecifiers, IndexType>)) && ... 
); + } +}; + +// We are reusing the same thing for layout_left and layout_left_padded +// For layout_left as source StaticStride is static_extent(0) +template<class Extents, size_t NumGaps, size_t StaticStride> +struct compute_s_static_layout_left { + // Neither StaticStride nor any of the provided extents can be zero. + // StaticStride can never be zero, the static_extents we are looking at are associated with + // integral slice specifiers - which wouldn't be valid for zero extent + template<size_t ... Idx> + MDSPAN_INLINE_FUNCTION + static constexpr size_t value(std::index_sequence<Idx...>) { + size_t val = ((Idx>0 && Idx<=NumGaps ? (Extents::static_extent(Idx) == dynamic_extent?0:Extents::static_extent(Idx)) : 1) * ... * (StaticStride == dynamic_extent?0:StaticStride)); + return val == 0?dynamic_extent:val; + } }; + } // namespace detail -// Suppress spurious warning with NVCC about no return statement. -// This is a known issue in NVCC and NVC++ -// Depending on the CUDA and GCC version we need both the builtin -// and the diagnostic push. I tried really hard to find something shorter -// but no luck ... -#if defined __NVCC__ - #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ - #pragma nv_diagnostic push - #pragma nv_diag_suppress = implicit_return_from_non_void_function - #else - #ifdef __CUDA_ARCH__ - #pragma diagnostic push - #pragma diag_suppress implicit_return_from_non_void_function - #endif - #endif -#elif defined __NVCOMPILER - #pragma diagnostic push - #pragma diag_suppress = implicit_return_from_non_void_function -#endif // Actual submdspan mapping call -template <class Extents, class... SliceSpecifiers> -MDSPAN_INLINE_FUNCTION -constexpr auto -submdspan_mapping(const layout_left::mapping<Extents> &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; +template <class Extents> +template <class... 
SliceSpecifiers> +MDSPAN_INLINE_FUNCTION constexpr auto +layout_left::mapping<Extents>::submdspan_mapping_impl( + SliceSpecifiers... slices) const { // compute sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // figure out sub layout type - constexpr bool preserve_layout = detail::preserve_layout_left_mapping< - decltype(std::make_index_sequence<src_ext_t::rank()>()), dst_ext_t::rank(), - SliceSpecifiers...>::value; - using dst_layout_t = - std::conditional_t<preserve_layout, layout_left, layout_stride>; - using dst_mapping_t = typename dst_layout_t::template mapping<dst_ext_t>; - - if constexpr (std::is_same_v<dst_layout_t, layout_left>) { + using deduce_layout = detail::deduce_layout_left_submapping< + typename dst_ext_t::index_type, dst_ext_t::rank(), + std::make_index_sequence<src_ext_t::rank()>, + SliceSpecifiers...>; + + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. + const bool out_of_bounds = + detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast<size_t>( + out_of_bounds ? 
this->required_span_size() + : this->operator()(detail::first_of(slices)...)); + + if constexpr (deduce_layout::layout_left_value()) { // layout_left case - return mapping_offset<dst_mapping_t>{ - dst_mapping_t(dst_ext), - static_cast<size_t>(src_mapping(detail::first_of(slices)...))}; + using dst_mapping_t = typename layout_left::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t(dst_ext), + offset}; + } else if constexpr (deduce_layout::layout_left_padded_value()) { + constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_left<Extents, deduce_layout::gap_len, Extents::static_extent(0)>::value(std::make_index_sequence<Extents::rank()>()); + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded<S_static>::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t>{ + dst_mapping_t(dst_ext, stride(1 + deduce_layout::gap_len)), offset}; } else { // layout_stride case - auto inv_map = detail::inv_map_rank( - std::integral_constant<size_t,0>(), - std::index_sequence<>(), - slices...); - return mapping_offset<dst_mapping_t>{ - dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, - // HIP needs deduction guides to have markups so we need to be explicit - // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue - #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) - std::tuple<decltype(detail::stride_of(slices))...>{detail::stride_of(slices)...})), - #else - std::tuple{detail::stride_of(slices)...})), - #endif - static_cast<size_t>(src_mapping(detail::first_of(slices)...))}; + using dst_mapping_t = typename layout_stride::mapping<dst_ext_t>; + auto inv_map = detail::inv_map_rank(std::integral_constant<size_t, 0>(), + std::index_sequence<>(), slices...); + return submdspan_mapping_result<dst_mapping_t> { + 
dst_mapping_t(mdspan_non_standard, dst_ext, + detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue but Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + detail::tuple<decltype(detail::stride_of(slices))...>{ + detail::stride_of(slices)...}).values), +#else + detail::tuple{detail::stride_of(slices)...}).values), +#endif + offset + }; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); #endif } -#if defined __NVCC__ - #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ - #pragma nv_diagnostic pop - #else - #ifdef __CUDA_ARCH__ - #pragma diagnostic pop - #endif - #endif -#elif defined __NVCOMPILER - #pragma diagnostic pop + +template <size_t PaddingValue> +template <class Extents> +template <class... SliceSpecifiers> +MDSPAN_INLINE_FUNCTION constexpr auto +MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded<PaddingValue>::mapping<Extents>::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + + // compute sub extents + using src_ext_t = Extents; + auto dst_ext = submdspan_extents(extents(), slices...); + using dst_ext_t = decltype(dst_ext); + + if constexpr (Extents::rank() == 0) { // rank-0 case + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded<PaddingValue>::template mapping<Extents>; + return submdspan_mapping_result<dst_mapping_t>{*this, 0}; + } else { + const bool out_of_bounds = + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast<size_t>( + out_of_bounds ? 
this->required_span_size() + : this->operator()(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::first_of(slices)...)); + if constexpr (dst_ext_t::rank() == 0) { // result rank-0 + // The following for some reasons leads to compiler error later, while not using a typedef works: + // Compilers: CUDA 11.2 with GCC 9.1 + // + // using dst_mapping_t = typename layout_left::template mapping<dst_ext_t>; + // return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t{dst_ext}, offset}; + // + // Error: submdspan_mapping.hpp:299:23: error: 'dst_mapping_t' does not name a type + // 299 | using dst_mapping_t = typename layout_left::template mapping<dst_ext_t>; + // The same error is given (about dst_mapping_t not naming type) when a different name is used in 299: + // using dst_mapping_t2 = typename layout_left::template mapping<dst_ext_t>; + + return submdspan_mapping_result<typename layout_left::template mapping<dst_ext_t>> + {typename layout_left::template mapping<dst_ext_t>{dst_ext}, offset}; + } else { // general case + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. 
+ // figure out sub layout type + using deduce_layout = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::deduce_layout_left_submapping< + typename dst_ext_t::index_type, dst_ext_t::rank(), + decltype(std::make_index_sequence<src_ext_t::rank()>()), + SliceSpecifiers...>; + + if constexpr (deduce_layout::layout_left_value() && dst_ext_t::rank() == 1) { // getting rank-1 from leftmost + using dst_mapping_t = typename layout_left::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t{dst_ext}, offset}; + } else if constexpr (deduce_layout::layout_left_padded_value()) { // can keep layout_left_padded + constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_left<Extents, deduce_layout::gap_len, static_padding_stride>::value(std::make_index_sequence<Extents::rank()>()); + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded<S_static>::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t>{ + dst_mapping_t(dst_ext, stride(1 + deduce_layout::gap_len)), offset}; + } else { // layout_stride + auto inv_map = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::inv_map_rank(std::integral_constant<size_t, 0>(), + std::index_sequence<>(), slices...); + using dst_mapping_t = typename layout_stride::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t> { + dst_mapping_t(mdspan_non_standard, dst_ext, + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue but Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<decltype(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices))...>{ +
MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...}).values), +#else + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple{MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...}).values), +#endif + offset + }; + } + } + } + + +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) + __builtin_unreachable(); #endif +} //********************************** // layout_right submdspan_mapping @@ -164,136 +356,275 @@ submdspan_mapping(const layout_left::mapping<Extents> &src_mapping, namespace detail { // Figure out whether to preserve layout_right -template <class IndexSequence, size_t SubRank, class... SliceSpecifiers> -struct preserve_layout_right_mapping; +template <class IndexType, size_t SubRank, class IndexSequence, + class... SliceSpecifiers> +struct deduce_layout_right_submapping; -template <class... SliceSpecifiers, size_t... Idx, size_t SubRank> -struct preserve_layout_right_mapping<std::index_sequence<Idx...>, SubRank, - SliceSpecifiers...> { - constexpr static size_t SrcRank = sizeof...(SliceSpecifiers); - constexpr static bool value = - // Preserve layout for rank 0 - (SubRank == 0) || - ( - // The last subrank slice specifiers need to be full_extent_t - except - // for the srcrank-subrank one which could also be tuple but not a - // strided index range slice specifiers before srcrank-subrank are - // integrals - ((Idx < - SrcRank - SubRank) || // these are only integral slice specifiers - (std::is_same_v<SliceSpecifiers, full_extent_t>) || - ((Idx == SrcRank - SubRank) && - std::is_convertible_v<SliceSpecifiers, std::tuple<size_t, size_t>>)) && - ...); +template <class IndexType, size_t SubRank, size_t... Idx, + class... SliceSpecifiers> +struct deduce_layout_right_submapping< + IndexType, SubRank, std::index_sequence<Idx...>, SliceSpecifiers...> { + + static constexpr size_t Rank = sizeof...(Idx); + using count_range = index_sequence_scan_impl< + 0u, (std::is_convertible_v<SliceSpecifiers, IndexType> ? 
0u : 1u)...>; + //__static_partial_sums<!std::is_convertible_v<SliceSpecifiers, + // IndexType>...>; + constexpr static int gap_len = + (((Idx < Rank - 1 && count_range::get(Idx) == SubRank - 1 && + std::is_convertible_v<SliceSpecifiers, IndexType>) + ? 1 + : 0) + + ... + 0); + + MDSPAN_INLINE_FUNCTION + constexpr static bool layout_right_value() { + // Use layout_right for rank 0 + if constexpr (SubRank == 0) { + return true; + // Use layout_right for rank 1 result if rightmost slice specifier is range like + } else if constexpr (SubRank == 1) { + return ((Idx < Rank - 1 || is_range_slice_v<SliceSpecifiers, IndexType>)&&...); + } else { + // Preserve if rightmost SubRank-1 slices are full_extent_t and + // the slice at idx Rank-Subrank is a range and + // for idx < Rank - SubRank the slice is an index + return ((((Idx >= Rank - SubRank) && std::is_same_v<SliceSpecifiers, full_extent_t>) || + ((Idx == Rank - SubRank) && is_range_slice_v<SliceSpecifiers, IndexType>) || + ((Idx < Rank - SubRank) && is_index_slice_v<SliceSpecifiers, IndexType>)) && ...); + } +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) + __builtin_unreachable(); +#endif + } + + MDSPAN_INLINE_FUNCTION + constexpr static bool layout_right_padded_value() { + // Technically could also keep layout_right_padded for SubRank==0 + // and SubRank==1 with rightmost slice specifier being a contiguous range + // but we intercept these cases separately + + // In all other cases: + // rightmost slice must be range + // then there can be a gap with index slices + // then SubRank - 2 full_extent slices + // then another range slice + // then more index slices + // e.g. 
I I R F F F I I I R for obtaining a rank-5 from a rank-10 + return ((((Idx == Rank - 1) && is_range_slice_v<SliceSpecifiers, IndexType>) || + ((Idx >= Rank - gap_len - 1 && Idx < Rank - 1) && is_index_slice_v<SliceSpecifiers, IndexType>) || + ((Idx > Rank - gap_len - SubRank && Idx < Rank - gap_len - 1) && std::is_same_v<SliceSpecifiers, full_extent_t>) || + ((Idx == Rank - gap_len - SubRank) && is_range_slice_v<SliceSpecifiers, IndexType>) || + ((Idx < Rank - gap_len - SubRank) && is_index_slice_v<SliceSpecifiers, IndexType>)) && ... ); + } +}; + +// We are reusing the same thing for layout_right and layout_right_padded +// For layout_right as source StaticStride is static_extent(Rank-1) +template<class Extents, size_t NumGaps, size_t StaticStride> +struct compute_s_static_layout_right { + // Neither StaticStride nor any of the provided extents can be zero. + // StaticStride can never be zero, the static_extents we are looking at are associated with + // integral slice specifiers - which wouldn't be valid for zero extent + template<size_t ... Idx> + MDSPAN_INLINE_FUNCTION + static constexpr size_t value(std::index_sequence<Idx...>) { + size_t val = ((Idx >= Extents::rank() - 1 - NumGaps && Idx < Extents::rank() - 1 ? (Extents::static_extent(Idx) == dynamic_extent?0:Extents::static_extent(Idx)) : 1) * ... * (StaticStride == dynamic_extent?0:StaticStride)); + return val == 0?dynamic_extent:val; + } }; + } // namespace detail -// Suppress spurious warning with NVCC about no return statement. -// This is a known issue in NVCC and NVC++ -// Depending on the CUDA and GCC version we need both the builtin -// and the diagnostic push. I tried really hard to find something shorter -// but no luck ... 
-#if defined __NVCC__ - #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ - #pragma nv_diagnostic push - #pragma nv_diag_suppress = implicit_return_from_non_void_function - #else - #ifdef __CUDA_ARCH__ - #pragma diagnostic push - #pragma diag_suppress implicit_return_from_non_void_function - #endif - #endif -#elif defined __NVCOMPILER - #pragma diagnostic push - #pragma diag_suppress = implicit_return_from_non_void_function -#endif -template <class Extents, class... SliceSpecifiers> -MDSPAN_INLINE_FUNCTION -constexpr auto -submdspan_mapping(const layout_right::mapping<Extents> &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - - // get sub extents +// Actual submdspan mapping call +template <class Extents> +template <class... SliceSpecifiers> +MDSPAN_INLINE_FUNCTION constexpr auto +layout_right::mapping<Extents>::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + + // compute sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); - // determine new layout type - constexpr bool preserve_layout = detail::preserve_layout_right_mapping< - decltype(std::make_index_sequence<src_ext_t::rank()>()), dst_ext_t::rank(), - SliceSpecifiers...>::value; - using dst_layout_t = - std::conditional_t<preserve_layout, layout_right, layout_stride>; - using dst_mapping_t = typename dst_layout_t::template mapping<dst_ext_t>; + // figure out sub layout type + using deduce_layout = detail::deduce_layout_right_submapping< + typename dst_ext_t::index_type, dst_ext_t::rank(), + std::make_index_sequence<src_ext_t::rank()>, + SliceSpecifiers...>; + + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. 
+ const bool out_of_bounds = + detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast<size_t>( + out_of_bounds ? this->required_span_size() + : this->operator()(detail::first_of(slices)...)); - if constexpr (std::is_same_v<dst_layout_t, layout_right>) { + if constexpr (deduce_layout::layout_right_value()) { // layout_right case - return mapping_offset<dst_mapping_t>{ - dst_mapping_t(dst_ext), - static_cast<size_t>(src_mapping(detail::first_of(slices)...))}; + using dst_mapping_t = typename layout_right::mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t(dst_ext), + offset}; + } else if constexpr (deduce_layout::layout_right_padded_value()) { + constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_right<Extents, deduce_layout::gap_len, Extents::static_extent(Extents::rank() - 1)>::value(std::make_index_sequence<Extents::rank()>()); + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded<S_static>::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t>{ + dst_mapping_t(dst_ext, + stride(src_ext_t::rank() - 2 - deduce_layout::gap_len)), + offset}; } else { // layout_stride case - auto inv_map = detail::inv_map_rank( - std::integral_constant<size_t,0>(), - std::index_sequence<>(), - slices...); - return mapping_offset<dst_mapping_t>{ - dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, - // HIP needs deduction guides to have markups so we need to be explicit - // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue - #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) - std::tuple<decltype(detail::stride_of(slices))...>{detail::stride_of(slices)...})), - #else - std::tuple{detail::stride_of(slices)...})), - #endif - static_cast<size_t>(src_mapping(detail::first_of(slices)...))}; + using
dst_mapping_t = typename layout_stride::mapping<dst_ext_t>; + auto inv_map = detail::inv_map_rank(std::integral_constant<size_t, 0>(), + std::index_sequence<>(), slices...); + return submdspan_mapping_result<dst_mapping_t> { + dst_mapping_t(mdspan_non_standard, dst_ext, + detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue but Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<decltype(detail::stride_of(slices))...>{ + detail::stride_of(slices)...}).values), +#else + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple{detail::stride_of(slices)...}).values), +#endif + offset + }; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); #endif } -#if defined __NVCC__ - #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ - #pragma nv_diagnostic pop - #else - #ifdef __CUDA_ARCH__ - #pragma diagnostic pop - #endif - #endif -#elif defined __NVCOMPILER - #pragma diagnostic pop + +template <size_t PaddingValue> +template <class Extents> +template <class... SliceSpecifiers> +MDSPAN_INLINE_FUNCTION constexpr auto +MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded<PaddingValue>::mapping<Extents>::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + + // compute sub extents + using src_ext_t = Extents; + auto dst_ext = submdspan_extents(extents(), slices...); + using dst_ext_t = decltype(dst_ext); + + if constexpr (Extents::rank() == 0) { // rank-0 case + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded<PaddingValue>::template mapping<Extents>; + return submdspan_mapping_result<dst_mapping_t>{*this, 0}; + } else { + // Figure out if any slice's lower bound equals the corresponding extent. 
+ // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. + // figure out sub layout type + const bool out_of_bounds = + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast<size_t>( + out_of_bounds ? this->required_span_size() + : this->operator()(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::first_of(slices)...)); + if constexpr (dst_ext_t::rank() == 0) { // result rank-0 + // Same issue as in layout_left_padded: see comment there + // using dst_mapping_t = typename layout_right::template mapping<dst_ext_t>; + // return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t{dst_ext}, offset}; + return submdspan_mapping_result<typename layout_right::template mapping<dst_ext_t>> + {typename layout_right::template mapping<dst_ext_t>{dst_ext}, offset}; + } else { // general case + using deduce_layout = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::deduce_layout_right_submapping< + typename dst_ext_t::index_type, dst_ext_t::rank(), + decltype(std::make_index_sequence<src_ext_t::rank()>()), + SliceSpecifiers...>; + + if constexpr (deduce_layout::layout_right_value() && dst_ext_t::rank() == 1) { // getting rank-1 from rightmost + using dst_mapping_t = typename layout_right::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t>{dst_mapping_t{dst_ext}, offset}; + } else if constexpr (deduce_layout::layout_right_padded_value()) { // can keep layout_right_padded + constexpr size_t S_static = MDSPAN_IMPL_STANDARD_NAMESPACE::detail::compute_s_static_layout_right<Extents, deduce_layout::gap_len, static_padding_stride>::value(std::make_index_sequence<Extents::rank()>()); + using dst_mapping_t = typename MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded<S_static>::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t>{ + dst_mapping_t(dst_ext, stride(Extents::rank() - 2 - deduce_layout::gap_len)), offset}; + } else { // layout_stride + auto inv_map = 
MDSPAN_IMPL_STANDARD_NAMESPACE::detail::inv_map_rank(std::integral_constant<size_t, 0>(), + std::index_sequence<>(), slices...); + using dst_mapping_t = typename layout_stride::template mapping<dst_ext_t>; + return submdspan_mapping_result<dst_mapping_t> { + dst_mapping_t(mdspan_non_standard, dst_ext, + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue but Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<decltype(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices))...>{ + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...}).values), +#else + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple{MDSPAN_IMPL_STANDARD_NAMESPACE::detail::stride_of(slices)...}).values), +#endif + offset + }; + } + } + } + + +#if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) + __builtin_unreachable(); #endif +} //********************************** // layout_stride submdspan_mapping //********************************* -template <class Extents, class... SliceSpecifiers> -MDSPAN_INLINE_FUNCTION -constexpr auto -submdspan_mapping(const layout_stride::mapping<Extents> &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); +template <class Extents> +template <class... SliceSpecifiers> +MDSPAN_INLINE_FUNCTION constexpr auto +layout_stride::mapping<Extents>::submdspan_mapping_impl( + SliceSpecifiers...
slices) const { + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); - auto inv_map = detail::inv_map_rank( - std::integral_constant<size_t,0>(), - std::index_sequence<>(), - slices...); + auto inv_map = detail::inv_map_rank(std::integral_constant<size_t, 0>(), + std::index_sequence<>(), slices...); using dst_mapping_t = typename layout_stride::template mapping<dst_ext_t>; - return mapping_offset<dst_mapping_t>{ - dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, - // HIP needs deduction guides to have markups so we need to be explicit - // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue - #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) - std::tuple<decltype(detail::stride_of(slices))...>(detail::stride_of(slices)...))), + + // Figure out if any slice's lower bound equals the corresponding extent. + // If so, bypass evaluating the layout mapping. This fixes LWG Issue 4060. + const bool out_of_bounds = + detail::any_slice_out_of_bounds(this->extents(), slices...); + auto offset = static_cast<size_t>( + out_of_bounds ? 
this->required_span_size() + : this->operator()(detail::first_of(slices)...)); + + return submdspan_mapping_result<dst_mapping_t> { + dst_mapping_t(mdspan_non_standard, dst_ext, + detail::construct_sub_strides( + *this, inv_map, +// HIP needs deduction guides to have markups so we need to be explicit +// NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have +// the issue but Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple<decltype(detail::stride_of(slices))...>( + detail::stride_of(slices)...)).values), #else - std::tuple(detail::stride_of(slices)...))), + MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple(detail::stride_of(slices)...)).values), #endif - static_cast<size_t>(src_mapping(detail::first_of(slices)...))}; + offset + }; } + } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE + +#if defined __NVCC__ +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diagnostic pop +#else +#ifdef __CUDA_ARCH__ +#pragma diagnostic pop +#endif +#endif +#elif defined __NVCOMPILER +#pragma diagnostic pop +#endif diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a714090e4483b8aeecfca0112f1458e5c98d07e9 --- /dev/null +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -0,0 +1,869 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include <cassert> +#include "layout_padded_fwd.hpp" +#include "../__p0009_bits/dynamic_extent.hpp" +#include "../__p0009_bits/extents.hpp" +#include "../__p0009_bits/mdspan.hpp" +#include "../__p0009_bits/layout_left.hpp" +#include "../__p0009_bits/layout_right.hpp" +#include "../__p0009_bits/layout_stride.hpp" +#include "../__p0009_bits/utility.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +namespace detail { +template<class _T> +MDSPAN_INLINE_FUNCTION +constexpr _T +find_next_multiple(_T alignment, _T offset) +{ + if ( alignment == 0 ) { + return _T(0); + } else { + return ( ( offset + alignment - 1 ) / alignment) * alignment; + } +} + +template <class _ExtentsType, size_t _PaddingValue, size_t _ExtentToPadIdx> +MDSPAN_INLINE_FUNCTION constexpr size_t get_actual_static_padding_value() { + constexpr auto rank = _ExtentsType::rank(); + + if constexpr (rank <= typename _ExtentsType::rank_type(1)) { + return 0; + } else if constexpr (_PaddingValue != dynamic_extent && + _ExtentsType::static_extent(_ExtentToPadIdx) != + dynamic_extent) { + static_assert( + (_PaddingValue != 0) || + (_ExtentsType::static_extent(_ExtentToPadIdx) == 0), + "padding stride can be 0 only if " + "extents_type::static_extent(extent-to-pad) is 0 or dynamic_extent"); + return find_next_multiple(_PaddingValue, + _ExtentsType::static_extent(_ExtentToPadIdx)); + } else { + return dynamic_extent; + } + // Missing return statement warning from NVCC and ICC +#if (defined(__NVCC__) || defined(__INTEL_COMPILER)) && !defined(__NVCOMPILER) + return 0; +#endif +} + +template <size_t _PaddingValue, typename _Extents, size_t _ExtentToPadIdx, size_t _Rank, typename Enabled = void> +struct static_array_type_for_padded_extent +{ + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = 
_Extents; + using type = ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, + ::MDSPAN_IMPL_STANDARD_NAMESPACE::MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::get_actual_static_padding_value<extents_type, _PaddingValue, + _ExtentToPadIdx>()>; +}; + +template <size_t _PaddingValue, typename _Extents, size_t _ExtentToPadIdx, size_t Rank> +struct static_array_type_for_padded_extent<_PaddingValue, _Extents, + _ExtentToPadIdx, Rank, std::enable_if_t<Rank <= 1>> { + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = + ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, 0>; +}; + +template <size_t _PaddingValue, typename _Extents, size_t _ExtentToPadIdx> +struct padded_extent { + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using static_array_type = typename static_array_type_for_padded_extent< + padding_value, _Extents, _ExtentToPadIdx, _Extents::rank()>::type; + + MDSPAN_INLINE_FUNCTION + static constexpr auto static_value() { return static_array_type::static_value(0); } + + MDSPAN_INLINE_FUNCTION + static constexpr static_array_type + init_padding(const _Extents &exts) { + if constexpr ((_Extents::rank() > 1) && (padding_value == dynamic_extent)) { + return {exts.extent(_ExtentToPadIdx)}; + } else { + return init_padding(exts, padding_value); + } + // Missing return statement warning from NVCC and ICC +#if (defined(__NVCC__) || defined(__INTEL_COMPILER)) && !defined(__NVCOMPILER) + return {}; +#endif + } + + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Extents &exts, + [[maybe_unused]] index_type pv) { + if constexpr (_Extents::rank() > 1) { + return {find_next_multiple(pv, + exts.extent(_ExtentToPadIdx))}; + } else { + return {}; + } + // Missing return statement warning from NVCC and 
ICC +#if (defined(__NVCC__) || defined(__INTEL_COMPILER)) && !defined(__NVCOMPILER) + return {}; +#endif + } + + template <typename _Mapping, size_t _PaddingStrideIdx> + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Mapping &other_mapping, + std::integral_constant<size_t, _PaddingStrideIdx>) { + if constexpr (_Extents::rank() > 1) { + return {other_mapping.stride(_PaddingStrideIdx)}; + } else { + return {}; + } + // Missing return statement warning from NVCC and ICC +#if (defined(__NVCC__) || defined(__INTEL_COMPILER)) && !defined(__NVCOMPILER) + return {}; +#endif + } +}; +} // namespace detail + +template <size_t PaddingValue> +template <class Extents> +class layout_left_padded<PaddingValue>::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_left_padded<padding_value>; + +#ifndef MDSPAN_INTERNAL_TEST +private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants<layout_type, extents_type>::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants<layout_type, extents_type>::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "out of bounds access for rank 0"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + MDSPAN_INLINE_FUNCTION constexpr index_type + 
compute_offset(std::index_sequence<>) const { + return 0; + } + + template <size_t Rank, class IndexOffset> + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<Rank>, IndexOffset index_offset) const { + return index_offset; + } + + template <size_t... Ranks, class... IndexOffsets> + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<Ranks...>, + IndexOffsets... index_offsets) const { + index_type indices[] = {static_cast<index_type>(index_offsets)...}; + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144 + index_type res = 0; + ((res = indices[extents_type::rank() - 1 - Ranks] + + ((extents_type::rank() - 1 - Ranks) == extent_to_pad_idx + ? padded_stride.value(0) + : exts.extent(extents_type::rank() - 1 - Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 || defined(__NVCC__) + MDSPAN_INLINE_FUNCTION + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type& ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) + {} + + /** + * Initializes the mapping with the given extents and the specified padding value. 
+ * + * This overload participates in overload resolution only if `is_convertible_v<Size, index_type>` + * is `true` and `is_nothrow_constructible_v<index_type, Size>` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v<index_type, _Size> + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, dynamic_padding_value)), exts(ext) + { + assert((padding_value == dynamic_extent) || (static_cast<index_type>(padding_value) == static_cast<index_type>(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_left::mapping`. + * + * This overload participates in overload resolution only if + * `is_constructible_v<extents_type, OtherExtents>` is true. If + * `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, + * or `OtherExtents::static_extent(0)` must be `dynamic_extent`; otherwise, + * `OtherExtents::static_extent(0)` must be equal to the least multiple of + * `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ (std::is_constructible_v<extents_type, _OtherExtents>)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<_OtherExtents, extents_type>)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant<size_t, padded_stride_idx>{})), + exts(other_mapping.extents()) { + static_assert( + (_OtherExtents::rank() > 1) || + (static_padding_stride != dynamic_extent) || + (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) || + (static_padding_stride == + 
_OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if + * `is_constructible_v<extents_type, OtherExtents>` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ (std::is_constructible_v<extents_type, _OtherExtents>)) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant<size_t, padded_stride_idx>{})), + exts(other_mapping.extents()) {} + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if + * `is_constructible_v<extents_type, OtherExtents>` is true. Either + * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or + * `padding_value == OtherPaddingStride`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value + &&std::is_constructible_v< + extents_type, typename _Mapping::extents_type>)) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && + (padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent))) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant<size_t, padded_stride_idx>{})), + exts(other_mapping.extents()) { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_right_padded::mapping`. 
+ * + * This overload participates in overload resolution only if + * `extents_type::rank()` is 0 or 1 and `is_constructible_v<extents_type, + * OtherExtents>` is `true`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value + &&extents_type::rank() <= 1 && + std::is_constructible_v<extents_type, + typename _Mapping::extents_type>)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<typename _Mapping::extents_type, extents_type>)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding( + static_cast<extents_type>(other_mapping.extents()), + other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) {} + + MDSPAN_INLINE_FUNCTION constexpr const extents_type & + extents() const noexcept { + return exts; + } + + constexpr std::array<index_type, extents_type::rank()> + strides() const noexcept { + if constexpr (extents_type::rank() == 0) { + return {}; + } else if constexpr (extents_type::rank() == 1) { + return {1}; + } else { + index_type value = 1; + std::array<index_type, extents_type::rank()> s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; + ++r) { + s[r] = value; + value *= exts.extent(r); + } + s[extents_type::rank() - 1] = value; + return s; + } + } + + MDSPAN_INLINE_FUNCTION constexpr index_type + required_span_size() const noexcept { + if constexpr (extents_type::rank() == 0) { + return 1; + } else if constexpr (extents_type::rank() == 1) { + return exts.extent(0); + } else { + index_type value = padded_stride.value(0); + for (rank_type r = 1; r < extents_type::rank(); ++r) { + value *= exts.extent(r); + } + return value; + } + } + + /** + * Return the mapping given the provided indices per rank. 
+ * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v<Indices, index_type> && ...) is true`, and + * - (is_nothrow_constructible_v<index_type, Indices> && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ (sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail:: + are_valid_indices<index_type, _Indices...>()))) + MDSPAN_INLINE_FUNCTION constexpr size_t + operator()(_Indices... idxs) const noexcept { +#if !defined(NDEBUG) + ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::check_all_indices(this->extents(), + idxs...); +#endif // ! NDEBUG + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return (extents_type::rank() <= rank_type(1)) || + (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent && + extents_type::static_extent(extent_to_pad_idx) == + padded_stride_type::static_value()); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { + return true; + } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { + return (extents_type::rank() < 2) || + (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { + return true; + } + + MDSPAN_INLINE_FUNCTION + constexpr index_type stride(rank_type r) const noexcept { + assert(r < extents_type::rank()); + if (r == 0) + return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = 1; k < r; k++) + value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_left_padded`s + * + * This 
overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is + * specified as taking `layout_left_padded< padding_value >::mapping< + * Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(const mapping &left, const _Mapping &right) noexcept { + // Workaround for some compilers not short-circuiting properly with + // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a + // rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) { + strides_equal = + left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping &left, const _Mapping &right) noexcept { + return !(left == right); + } +#endif + + // [mdspan.submdspan.mapping], submdspan mapping specialization + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... 
slices) { + return src.submdspan_mapping_impl(slices...); + } +}; + +template <size_t PaddingValue> +template <class Extents> +class layout_right_padded<PaddingValue>::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_right_padded<padding_value>; + +#ifndef MDSPAN_INTERNAL_TEST + private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants<layout_type, extents_type>::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants<layout_type, extents_type>::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "if padding stride is 0, static_extent(extent-to-pad-rank) must also be 0 or dynamic_extent"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<>) const { + return 0; + } + + template <size_t Rank, class IndexOffset> + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<Rank>, IndexOffset index_offset) const { + return index_offset; + } + + template <size_t... Ranks, class... IndexOffsets> + MDSPAN_INLINE_FUNCTION constexpr index_type + compute_offset(std::index_sequence<Ranks...>, + IndexOffsets... 
index_offsets) const { + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141 + index_type res = 0; + ((res = static_cast<index_type>(index_offsets) + + (Ranks == extent_to_pad_idx ? padded_stride.value(0) + : exts.extent(Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 || defined(__NVCC__) + MDSPAN_INLINE_FUNCTION + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) {} + + /** + * Initializes the mapping with the given extents and the specified padding value. 
+ * + * This overload participates in overload resolution only if `is_convertible_v<Size, index_type>` + * is `true` and `is_nothrow_constructible_v<index_type, Size>` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v<index_type, _Size> + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, static_cast<index_type>(dynamic_padding_value))), + exts(ext) { + assert((padding_value == dynamic_extent) || + (static_cast<index_type>(padding_value) == static_cast<index_type>(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_right::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v<extents_type, OtherExtents>` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ (std::is_constructible_v<extents_type, _OtherExtents>)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<_OtherExtents, extents_type>)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant<size_t, padded_stride_idx>{})), + exts(other_mapping.extents()) { + static_assert( + (_OtherExtents::rank() > 1) || + (padded_stride_type::static_value() != dynamic_extent) || + (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) || + (padded_stride_type::static_value() == + 
_OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if + * `is_constructible_v<extents_type, OtherExtents>` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ (std::is_constructible_v<extents_type, _OtherExtents>)) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant<size_t, padded_stride_idx>{})), + exts(other_mapping.extents()) {} + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if + * `is_constructible_v<extents_type, OtherExtents>` is true. Either + * `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or + * `padding_value == OtherPaddingStride`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value + &&std::is_constructible_v< + extents_type, typename _Mapping::extents_type>)) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && + (padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent))) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding( + other_mapping, + std::integral_constant<size_t, padded_stride_idx>{})), + exts(other_mapping.extents()) { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_left_padded::mapping`. 
+ * + * This overload participates in overload resolution only if + * `extents_type::rank()` is 0 or 1 and `is_constructible_v<extents_type, + * OtherExtents>` is `true`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ (detail::is_layout_left_padded_mapping<_Mapping>::value + &&extents_type::rank() <= 1 && + std::is_constructible_v<extents_type, + typename _Mapping::extents_type>)) + MDSPAN_CONDITIONAL_EXPLICIT( + (!std::is_convertible_v<typename _Mapping::extents_type, extents_type>)) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding( + static_cast<extents_type>(other_mapping.extents()), + other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) {} + + MDSPAN_INLINE_FUNCTION constexpr const extents_type & + extents() const noexcept { + return exts; + } + + constexpr std::array<index_type, extents_type::rank()> + strides() const noexcept { + if constexpr (extents_type::rank() == 0) { + return {}; + } else if constexpr (extents_type::rank() == 1) { + return {1}; + } else { + index_type value = 1; + std::array<index_type, extents_type::rank()> s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) { + s[r] = value; + value *= exts.extent(r); + } + s[0] = value; + return s; + } + } + + MDSPAN_INLINE_FUNCTION constexpr index_type + required_span_size() const noexcept { + if constexpr (extents_type::rank() == 0) { + return 1; + } else if constexpr (extents_type::rank() == 1) { + return exts.extent(0); + } else { + index_type value = 1; + for (rank_type r = 0; r < extent_to_pad_idx; ++r) { + value *= exts.extent(r); + } + return value * padded_stride.value(0); + } + } + + /** + * Return the mapping given the provided indices per rank. 
+ * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v<Indices, index_type> && ...) is true`, and + * - (is_nothrow_constructible_v<index_type, Indices> && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ (sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail:: + are_valid_indices<index_type, _Indices...>()))) + MDSPAN_INLINE_FUNCTION constexpr size_t + operator()(_Indices... idxs) const noexcept { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return (extents_type::rank() <= rank_type(1)) || + (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent && + extents_type::static_extent(extent_to_pad_idx) == + padded_stride_type::static_value()); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { + return true; + } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { + return true; + } + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { + return (extents_type::rank() < 2) || + (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { + return true; + } + + MDSPAN_INLINE_FUNCTION constexpr index_type + stride(rank_type r) const noexcept { + assert(r < extents_type::rank()); + if (r == extents_type::rank() - 1) + return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = extents_type::rank() - 2; k > r; k--) + value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_right_padded`s + * + * This overload only participates in overload resolution if + * `OtherExtents::rank() == 
extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is + * specified as taking `layout_right_padded< padding_value >::mapping< + * Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator==(const mapping &left, const _Mapping &right) noexcept { + // Workaround for some compilers not short-circuiting properly with + // compile-time checks i.e. we can't access stride(_padding_stride_idx) of a + // rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) { + strides_equal = + left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_right_padded`s + * + * This overload only participates in overload resolution if + * `OtherExtents::rank() == extents_type::rank()`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ (detail::is_layout_right_padded_mapping<_Mapping>::value && + (_Mapping::extents_type::rank() == extents_type::rank()))) + MDSPAN_INLINE_FUNCTION friend constexpr bool + operator!=(const mapping &left, const _Mapping &right) noexcept { + return !(left == right); + } +#endif + + // [mdspan.submdspan.mapping], submdspan mapping specialization + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template<class... SliceSpecifiers> + MDSPAN_INLINE_FUNCTION + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... 
slices) { + return src.submdspan_mapping_impl(slices...); + } +}; +} +} diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3f141ff08aaddf8604fd2a1ff0a3450038cbec52 --- /dev/null +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp @@ -0,0 +1,137 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include <cassert> +#include "../__p0009_bits/dynamic_extent.hpp" +#include "../__p0009_bits/utility.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +template <size_t padding_value = dynamic_extent> +struct layout_left_padded { + template <class _Extents> + class mapping; +}; + +template <size_t padding_value = dynamic_extent> +struct layout_right_padded { + template <class _Extents> + class mapping; +}; + +namespace detail { +// The layout_padded_constants structs are only useful if rank > 1, otherwise they may wrap +template <class _Layout, class _ExtentsType> +struct layout_padded_constants; + +template <class _ExtentsType, size_t _PaddingStride> +struct layout_padded_constants<layout_left_padded<_PaddingStride>, _ExtentsType> +{ + using rank_type = typename _ExtentsType::rank_type; + static constexpr rank_type padded_stride_idx = 1; + static constexpr rank_type extent_to_pad_idx = 0; +}; + +template <class _ExtentsType, size_t _PaddingStride> 
+struct layout_padded_constants<layout_right_padded<_PaddingStride>, _ExtentsType>
+{
+  using rank_type = typename _ExtentsType::rank_type;
+  static constexpr rank_type padded_stride_idx = _ExtentsType::rank() - 2;  // second-to-last index
+  static constexpr rank_type extent_to_pad_idx = _ExtentsType::rank() - 1;  // last index
+};
+// is_layout_left_padded<L>: true_type exactly when L is layout_left_padded<P> for some P, false_type otherwise.
+template <class _Layout>
+struct is_layout_left_padded : std::false_type {};
+
+template <size_t _PaddingStride>
+struct is_layout_left_padded<layout_left_padded<_PaddingStride>> : std::true_type {};
+// is_layout_left_padded_mapping<M>: false_type by default (primary template).
+template <class _Mapping, class _Enabled = void>
+struct is_layout_left_padded_mapping : std::false_type {};
+// true_type when M is exactly layout_left_padded<M::padding_value>::mapping<M::extents_type>.
+template <class _Mapping>
+struct is_layout_left_padded_mapping<_Mapping,
+  std::enable_if_t<std::is_same<_Mapping, typename layout_left_padded<_Mapping::padding_value>::template mapping<typename _Mapping::extents_type>>::value>>
+    : std::true_type {};
+// is_layout_right_padded<L>: true_type exactly when L is layout_right_padded<P> for some P, false_type otherwise.
+template <class _Layout>
+struct is_layout_right_padded : std::false_type {};
+
+template <size_t _PaddingStride>
+struct is_layout_right_padded<layout_right_padded<_PaddingStride>> : std::true_type {};
+// is_layout_right_padded_mapping<M>: false_type by default (primary template).
+template <class _Mapping, class _Enabled = void>
+struct is_layout_right_padded_mapping : std::false_type {};
+// true_type when M is exactly layout_right_padded<M::padding_value>::mapping<M::extents_type>.
+template <class _Mapping>
+struct is_layout_right_padded_mapping<_Mapping,
+  std::enable_if_t<std::is_same<_Mapping, typename layout_right_padded<_Mapping::padding_value>::template mapping<typename _Mapping::extents_type>>::value>>
+    : std::true_type {};
+// Compile-time mandates for converting constructors involving a padded layout, selected by a with_rank<N> tag.
+// The with_rank<0> and with_rank<1> overloads are intentionally empty: they check nothing.
+template <class _LayoutExtentsType, class _PaddedLayoutMappingType>
+MDSPAN_INLINE_FUNCTION
+constexpr void check_padded_layout_converting_constructor_mandates(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<0>) {}
+
+template <class _LayoutExtentsType, class _PaddedLayoutMappingType>
+MDSPAN_INLINE_FUNCTION
+constexpr void check_padded_layout_converting_constructor_mandates(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<1>) {}
+// Generic-rank overload: static_asserts divisibility of the padded extent by padding_value when all inputs are static.
+template <class _LayoutExtentsType, class _PaddedLayoutMappingType, std::size_t N>
+MDSPAN_INLINE_FUNCTION
+constexpr void check_padded_layout_converting_constructor_mandates(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<N>)
+{
+  using extents_type = typename _PaddedLayoutMappingType::extents_type;
+  constexpr auto padding_value = _PaddedLayoutMappingType::padding_value;
+  constexpr auto idx = layout_padded_constants<typename _PaddedLayoutMappingType::layout_type, _LayoutExtentsType >::extent_to_pad_idx;
+  // The check only applies when both static extents at idx and the padding value are known at compile time.
+  constexpr auto statically_determinable =
+    (_LayoutExtentsType::static_extent(idx) != dynamic_extent) &&
+    (extents_type::static_extent(idx) != dynamic_extent) &&
+    (padding_value != dynamic_extent);
+  // padding_value == 0 is handled separately so the modulo below never divides by zero.
+  static_assert(!statically_determinable ||
+                (padding_value == 0
+                 ? _LayoutExtentsType::static_extent(idx) == 0
+                 : _LayoutExtentsType::static_extent(idx) % padding_value == 0),
+                "padded layout conversion: the static extent at the padded index must be a multiple of the static padding value (0 if that value is 0)");
+}
+// Runtime preconditions for the same conversions; with_rank<0>/<1> check nothing and ignore the mapping argument.
+template <typename _ExtentsType, typename _OtherMapping>
+MDSPAN_INLINE_FUNCTION
+constexpr void check_padded_layout_converting_constructor_preconditions(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<0>,
+                                                                        const _OtherMapping&) {}
+template <typename _ExtentsType, typename _OtherMapping>
+MDSPAN_INLINE_FUNCTION
+constexpr void check_padded_layout_converting_constructor_preconditions(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<1>,
+                                                                        const _OtherMapping&) {}
+template <typename _ExtentsType, typename _OtherMapping, std::size_t N>
+MDSPAN_INLINE_FUNCTION
+constexpr void check_padded_layout_converting_constructor_preconditions(MDSPAN_IMPL_STANDARD_NAMESPACE::detail::with_rank<N>,
+                                                                        const _OtherMapping &other_mapping) {
+  constexpr auto padded_stride_idx =
+    layout_padded_constants<typename _OtherMapping::layout_type,
+                            _ExtentsType>::padded_stride_idx;  // indices come from _OtherMapping's layout_type
+  constexpr auto extent_to_pad_idx = layout_padded_constants<typename _OtherMapping::layout_type, _ExtentsType>::extent_to_pad_idx;
+  MDSPAN_IMPL_PRECONDITION(other_mapping.stride(padded_stride_idx) == other_mapping.extents().extent(extent_to_pad_idx));  // stride at the padded index must equal the padded extent
+}
+
+
+}  // namespace detail
+}  // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE
+}  // namespace MDSPAN_IMPL_STANDARD_NAMESPACE
diff --git
a/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp b/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp index b440873526ab488857766d8e893343f951920ccc..4a0e354ffd02183e0c738c9c716c1228b3b180bc 100644 --- a/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp +++ b/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp @@ -35,7 +35,9 @@ #include "../experimental/__p0009_bits/layout_right.hpp" #include "../experimental/__p0009_bits/macros.hpp" #if MDSPAN_HAS_CXX_17 +#include "../experimental/__p2642_bits/layout_padded.hpp" #include "../experimental/__p2630_bits/submdspan.hpp" #endif +#include "../experimental/__p2389_bits/dims.hpp" #endif // MDSPAN_HPP_