diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000000..2488790254 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,84 @@ +name: github-DOCS + +on: + pull_request: + branches: + - master + - develop + +permissions: + contents: none + +jobs: + docs-check: + runs-on: ubuntu-latest + steps: + - name: Install Dependencies + run: | + sudo apt-get update + sudo apt-get install --no-install-recommends doxygen-latex + pip install sphinx + pip install breathe + pip install sphinx-rtd-theme + + - name: checkout_kokkos_kernels + uses: actions/checkout@v3 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v3 + with: + repository: kokkos/kokkos + ref: develop + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DCMAKE_CXX_FLAGS="-Werror" \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j2 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib/cmake/Kokkos \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkosKernels_ENABLE_DOCS=ON \ + .. + + - name: build_kokkos_kernels_doxygen + working-directory: kokkos-kernels/build + run: | + echo "Redirecting full output to doxygen.out..." + make Doxygen > doxygen.out 2>&1 || true + error_ret=$(grep 'Error' doxygen.out | head -c 1) || true + if [ ! 
-z $error_ret ]; then + echo "---- BEGIN: Summary of errors ---- " + cat doxygen.out | grep -i 'error:' || true + echo "---- END: Summary of errors ---- " + echo + echo + echo "---- BEGIN: Summary of warnings ---- " + cat doxygen.out | grep -i 'warning:' || true + echo "---- END: Summary of warnings ---- " + exit 1 + fi + + - name: build_kokkos_kernels_sphinx + working-directory: kokkos-kernels/build + run: make Sphinx diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index df1df44ad7..220461fe62 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install Dependencies run: sudo apt install clang-format-8 @@ -31,9 +31,18 @@ jobs: fi done - # If any diffs exist, error out + # If any diffs exist, print the patch and error out if [[ ! -z $(git status -s -uno . -- ':!.github') ]]; then echo "The following files require formatting changes:" git status -s -uno . 
-- ':!.github' + + echo "==== Begin Format Patch ====" + # --cached means show staged changes (git add above) + git --no-pager diff --patch --cached + echo "==== End Format Patch ====" + + echo "To automate formatting, see:" + echo " https://kokkos-kernels.readthedocs.io/en/latest/developer/style.html#id1" + exit 1 fi diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index b33c27f7f7..8a5681f9c7 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -50,12 +50,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} @@ -72,6 +72,8 @@ jobs: -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=${{ matrix.debug_bounds_check }} \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. diff --git a/BUILD.md b/BUILD.md index 6c91042b78..5be269bd7c 100644 --- a/BUILD.md +++ b/BUILD.md @@ -192,7 +192,7 @@ endif() * Whether to pre instantiate kernels for the scalar type double. This option is KokkosKernels_INST_DOUBLE=ON by default. Disabling this may increase build times. * Default: ON * KokkosKernels_INST_EXECSPACE_OPENMP: BOOL - * Whether to pre instantiate kernels for the execution space Kokkos::OpenMP. Disabling this when Kokkos_ENABLE_OpenMP is enabled may increase build times. + * Whether to pre instantiate kernels for the execution space Kokkos::OpenMP. Disabling this when Kokkos_ENABLE_OPENMP is enabled may increase build times. * Default: ON if Kokkos is OpenMP-enabled, OFF otherwise. * KokkosKernels_INST_EXECSPACE_SERIAL: BOOL * Whether to build kernels for the execution space Kokkos::Serial. 
If explicit template instantiation (ETI) is enabled in Trilinos, disabling this when Kokkos_ENABLE_SERIAL is enabled may increase build times. diff --git a/CHANGELOG.md b/CHANGELOG.md index ac3b708fb2..91268a35fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,145 @@ # Change Log +## [4.1.00](https://github.com/kokkos/kokkos-kernels/tree/4.1.00) (2023-06-16) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.01...4.1.00) + +### New Features + +#### BLAS updates +- Adding interface with execution space instance argument to support execution of BLAS on stream + - Norms on stream [\#1795](https://github.com/kokkos/kokkos-kernels/pull/1795) + - Blas1 on stream [\#1803](https://github.com/kokkos/kokkos-kernels/pull/1803) + - Blas2 and 3 on stream [\#1812](https://github.com/kokkos/kokkos-kernels/pull/1812) +- Improving BLAS level 2 support by adding native implementation and TPL for GER, HER and SYR + - Implementation for BLAS2 ger [\#1756](https://github.com/kokkos/kokkos-kernels/pull/1756) + - Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) + +#### Batched updates +- Optimizing algorithms for single input data + - Add calls to KokkosBlas Dot and Axpy for team batched kernels when m==1 [\#1753](https://github.com/kokkos/kokkos-kernels/pull/1753) + - Add calls to KokkosBlas Gemv and Spmv for team batched kernels when m==1 [\#1770](https://github.com/kokkos/kokkos-kernels/pull/1770) + +#### Sparse updates +- Adding stream support to ILUK/SPTRSV and sort/merge + - Streams interface for SPILUK numeric [\#1728](https://github.com/kokkos/kokkos-kernels/pull/1728) + - Stream interface for SPTRSV solve [\#1820](https://github.com/kokkos/kokkos-kernels/pull/1820) + - Add exec instance support to sort/sort_and_merge utils [\#1744](https://github.com/kokkos/kokkos-kernels/pull/1744) +- Add BsrMatrix SpMV in rocSparse TPL, rewrite BsrMatrix SpMV unit tests 
[\#1769](https://github.com/kokkos/kokkos-kernels/pull/1769) +- sparse: Add coo2crs, crs2coo and CooMatrix [\#1686](https://github.com/kokkos/kokkos-kernels/pull/1686) +- Adds team- and thread-based lower-bound and upper-bound search and predicates [\#1711](https://github.com/kokkos/kokkos-kernels/pull/1711) +- Adds KokkosKernels::Impl::Iota, a view-like where iota(i) = i + offset [\#1710](https://github.com/kokkos/kokkos-kernels/pull/1710) + +#### Misc updates +- ODE: explicit integration methods [\#1754](https://github.com/kokkos/kokkos-kernels/pull/1754) + +### Enhancements: + +#### BLAS +- refactor blas3 tests to use benchmark library [\#1751](https://github.com/kokkos/kokkos-kernels/pull/1751) + +#### Batched +- batched/eti: ETI host-level interfaces [\#1783](https://github.com/kokkos/kokkos-kernels/pull/1783) +- batched/dense: Add gesv DynRankView runtime checks [\#1850](https://github.com/kokkos/kokkos-kernels/pull/1850) + +#### Sparse +- Add support for complex data types in MDF [\#1776](https://github.com/kokkos/kokkos-kernels/pull/1776) +- Sort and merge improvements [\#1773](https://github.com/kokkos/kokkos-kernels/pull/1773) +- spgemm handle: check that A,B,C graphs never change [\#1742](https://github.com/kokkos/kokkos-kernels/pull/1742) +- Fix/enhance backend issues on spadd perftest [\#1672](https://github.com/kokkos/kokkos-kernels/pull/1672) +- Spgemm perf test enhancements [\#1664](https://github.com/kokkos/kokkos-kernels/pull/1664) +- add explicit tests of opt-in algorithms in SpMV [\#1712](https://github.com/kokkos/kokkos-kernels/pull/1712) + +#### Common utilities +- Added TplsVersion file and print methods [\#1693](https://github.com/kokkos/kokkos-kernels/pull/1693) +- Add basis skeleton for KokkosKernels::print_configuration [\#1665](https://github.com/kokkos/kokkos-kernels/pull/1665) +- Add git information to benchmark context [\#1722](https://github.com/kokkos/kokkos-kernels/pull/1722) +- Test mixed scalars: more fixes related to mixed 
scalar tests [\#1694](https://github.com/kokkos/kokkos-kernels/pull/1694) +- PERF TESTS: adding utilities and instantiation wrapper [\#1676](https://github.com/kokkos/kokkos-kernels/pull/1676) + +#### TPL support +- Refactor MKL TPL for both CPU and GPU usage [\#1779](https://github.com/kokkos/kokkos-kernels/pull/1779) +- MKL: support indices properly [\#1868](https://github.com/kokkos/kokkos-kernels/pull/1868) +- Use rocsparse_spmv_ex for rocm >= 5.4.0 [\#1701](https://github.com/kokkos/kokkos-kernels/pull/1701) + + +### Build System: +- Do not change memory spaces instantiation defaults based on Kokkos_ENABLE_CUDA_UVM [\#1835](https://github.com/kokkos/kokkos-kernels/pull/1835) +- KokkosKernels: Remove TriBITS Kokkos subpackages (trilinos/Trilinos#11545) [\#1817](https://github.com/kokkos/kokkos-kernels/pull/1817) +- CMakeLists.txt: Add alias to match what is exported from Trilinos [\#1855](https://github.com/kokkos/kokkos-kernels/pull/1855) +- KokkosKernels: Don't list include for non-existent 'batched' build dir (trilinos/Trilinos#11966) [\#1867](https://github.com/kokkos/kokkos-kernels/pull/1867) +- Remove non-existent subdir kokkos-kernels/common/common (#11921, #11863) [\#1854](https://github.com/kokkos/kokkos-kernels/pull/1854) +- KokkosKernels: Remove non-existent common/src/[impl,tpls] include dirs (trilinos/Trilinos#11545) [\#1844](https://github.com/kokkos/kokkos-kernels/pull/1844) + +### Documentation and Testing: +- Enable sphinx werror [\#1856](https://github.com/kokkos/kokkos-kernels/pull/1856) +- Update cmake option naming in docs/comments [\#1849](https://github.com/kokkos/kokkos-kernels/pull/1849) +- docs/developer: Add Experimental namespace [\#1852](https://github.com/kokkos/kokkos-kernels/pull/1852) +- docs: Add profiling for compile times [\#1843](https://github.com/kokkos/kokkos-kernels/pull/1843) +- Ger: adding documentation stubs in apidocs [\#1822](https://github.com/kokkos/kokkos-kernels/pull/1822) +- .github/workflows: Summarize 
github-DOCS errors and warnings [\#1814](https://github.com/kokkos/kokkos-kernels/pull/1814) +- Blas1: docs update for PR #1803 [\#1805](https://github.com/kokkos/kokkos-kernels/pull/1805) +- apt-get update in hosted runner docs check [\#1797](https://github.com/kokkos/kokkos-kernels/pull/1797) +- scripts: Fix github-DOCS [\#1796](https://github.com/kokkos/kokkos-kernels/pull/1796) +- Add --enable-docs option to cm_generate_makefile [\#1785](https://github.com/kokkos/kokkos-kernels/pull/1785) +- docs: Add stubs for some sparse APIs [\#1768](https://github.com/kokkos/kokkos-kernels/pull/1768) +- .github: Update to actions/checkout@v3 [\#1767](https://github.com/kokkos/kokkos-kernels/pull/1767) +- docs: Include BatchedGemm [\#1765](https://github.com/kokkos/kokkos-kernels/pull/1765) +- .github: Automation reminder [\#1726](https://github.com/kokkos/kokkos-kernels/pull/1726) +- Allow an HTML-only docs build [\#1723](https://github.com/kokkos/kokkos-kernels/pull/1723) +- SYCL CI: Specify the full path to the compiler [\#1670](https://github.com/kokkos/kokkos-kernels/pull/1670) +- Add github DOCS ci check & disable Kokkos tests [\#1647](https://github.com/kokkos/kokkos-kernels/pull/1647) +- Add rocsparse,rocblas, to enabled TPLs in cm_test_all_sandia when --spot-check-tpls [\#1841](https://github.com/kokkos/kokkos-kernels/pull/1841) +- cm_test_all_sandia: update to add caraway queues for MI210, MI250 [\#1840](https://github.com/kokkos/kokkos-kernels/pull/1840) +- Support rocSparse in rocm 5.2.0 [\#1833](https://github.com/kokkos/kokkos-kernels/pull/1833) +- Add KokkosKernels_PullRequest_VEGA908_Tpls_ROCM520 support, only enable KokkosBlas::gesv where supported [\#1816](https://github.com/kokkos/kokkos-kernels/pull/1816) +- scripts: Include OMP settings [\#1801](https://github.com/kokkos/kokkos-kernels/pull/1801) +- Print the patch that clang-format-8 wants to apply [\#1714](https://github.com/kokkos/kokkos-kernels/pull/1714) + +### Benchmarks: +- Benchmark cleanup for 
par_ilut and spmv [\#1853](https://github.com/kokkos/kokkos-kernels/pull/1853) +- SpMV: adding benchmark for spmv [\#1821](https://github.com/kokkos/kokkos-kernels/pull/1821) +- New performance test for par_ilut, ginkgo::par_ilut, and spill [\#1799](https://github.com/kokkos/kokkos-kernels/pull/1799) +- Include OpenMP environment variables in benchmark context [\#1789](https://github.com/kokkos/kokkos-kernels/pull/1789) +- Re-enable and clean up triangle counting perf test [\#1752](https://github.com/kokkos/kokkos-kernels/pull/1752) +- Include google/benchmark lib version in benchmark output [\#1750](https://github.com/kokkos/kokkos-kernels/pull/1750) +- Refactor blas2 test for benchmark feature [\#1733](https://github.com/kokkos/kokkos-kernels/pull/1733) +- Adds a better parilut test with gmres [\#1661](https://github.com/kokkos/kokkos-kernels/pull/1661) +- Refactor blas1 test for benchmark feature [\#1636](https://github.com/kokkos/kokkos-kernels/pull/1636) + +### Cleanup: +- Drop outdated workarounds for backward compatibility with Kokkos [\#1836](https://github.com/kokkos/kokkos-kernels/pull/1836) +- Remove dead code guarded [\#1834](https://github.com/kokkos/kokkos-kernels/pull/1834) +- Remove decl ETI files [\#1824](https://github.com/kokkos/kokkos-kernels/pull/1824) +- Reorganize par_ilut performance test [\#1818](https://github.com/kokkos/kokkos-kernels/pull/1818) +- Deprecate Kokkos::Details::ArithTraits [\#1748](https://github.com/kokkos/kokkos-kernels/pull/1748) +- Drop obsolete workaround #ifdef KOKKOS_IF_ON_HOST [\#1720](https://github.com/kokkos/kokkos-kernels/pull/1720) +- Drop pre Kokkos 3.6 workaround [\#1653](https://github.com/kokkos/kokkos-kernels/pull/1653) +- View::Rank -> View::rank [\#1703](https://github.com/kokkos/kokkos-kernels/pull/1703) +- Prefer Kokkos::View::{R->r}ank [\#1679](https://github.com/kokkos/kokkos-kernels/pull/1679) +- Call concurrency(), not impl_thread_pool_size() 
[\#1666](https://github.com/kokkos/kokkos-kernels/pull/1666) +- Kokkos moves ALL_t out of Impl namespace [\#1658](https://github.com/kokkos/kokkos-kernels/pull/1658) +- Add KokkosKernels::Impl::are_integral_v helper variable template and quit using Kokkos::Impl::are_integral trait [\#1652](https://github.com/kokkos/kokkos-kernels/pull/1652) + +### Bug Fixes: +- Kokkos 4 compatibility: modifying the preprocessor logic [\#1827](https://github.com/kokkos/kokkos-kernels/pull/1827) +- blas/tpls: Fix gemm include guard typo [\#1848](https://github.com/kokkos/kokkos-kernels/pull/1848) +- spmv cusparse version check modified for cuda/11.1 [\#1828](https://github.com/kokkos/kokkos-kernels/pull/1828) +- Workaround for #1777 - cusparse spgemm test hang [\#1811](https://github.com/kokkos/kokkos-kernels/pull/1811) +- Fix 1798 [\#1800](https://github.com/kokkos/kokkos-kernels/pull/1800) +- BLAS: fixes and testing for LayoutStride [\#1794](https://github.com/kokkos/kokkos-kernels/pull/1794) +- Fix 1786: check that work array is contiguous in SVD [\#1793](https://github.com/kokkos/kokkos-kernels/pull/1793) +- Fix unused variable warnings [\#1790](https://github.com/kokkos/kokkos-kernels/pull/1790) +- Use KOKKOS_IMPL_DO_NOT_USE_PRINTF in Test_Common_UpperBound.hpp [\#1784](https://github.com/kokkos/kokkos-kernels/pull/1784) +- Batched Gesv: initializing variable to make compiler happy [\#1778](https://github.com/kokkos/kokkos-kernels/pull/1778) +- perf test utils: fix device ID parsing [\#1739](https://github.com/kokkos/kokkos-kernels/pull/1739) +- Fix OOB and improve comments in BsrMatrix COO constructor [\#1732](https://github.com/kokkos/kokkos-kernels/pull/1732) +- batched/unit_test: Disable simd dcomplex4 test for intel > 19.05 and <= 2021. 
[\#1857](https://github.com/kokkos/kokkos-kernels/pull/1857) +- rocsparse spmv tpl: Fix rocsparse_spmv call for rocm < 5.4.0 [\#1716](https://github.com/kokkos/kokkos-kernels/pull/1716) +- compatibility with 4.0.0 [\#1709](https://github.com/kokkos/kokkos-kernels/pull/1709) +- team mult: fix type issue in max_error calculation [\#1706](https://github.com/kokkos/kokkos-kernels/pull/1706) +- cast Kokkos::Impl::integral_constant to int [\#1697](https://github.com/kokkos/kokkos-kernels/pull/1697) + + ## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-04-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.00...4.0.01) diff --git a/CMakeLists.txt b/CMakeLists.txt index 19c66c0f64..fa666ab33e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 0) -SET(KokkosKernels_VERSION_PATCH 1) +SET(KokkosKernels_VERSION_MINOR 1) +SET(KokkosKernels_VERSION_PATCH 00) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file @@ -40,10 +40,8 @@ INCLUDE(GNUInstallDirs) IF (KOKKOSKERNELS_HAS_TRILINOS) SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) SET(KOKKOSKERNELS_HEADER_INSTALL_DIR ${TRILINOS_INCDIR}) - SET(KOKKOS_ENABLE_CUDA_UVM ${Kokkos_ENABLE_CUDA_UVM}) ELSEIF(KOKKOSKERNELS_HAS_PARENT) SET(KOKKOSKERNELS_HEADER_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}/kokkos-kernels") - SET(KOKKOS_ENABLE_CUDA_UVM ${Kokkos_ENABLE_CUDA_UVM}) ELSE() SET(KOKKOSKERNELS_HEADER_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") ENDIF() @@ -88,6 +86,10 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) include_directories(tpls/rajaperf/src) set(KokkosKernels_ENABLE_PERFTESTS ON CACHE BOOL "Whether to build tests including Perfsuite. 
Default: OFF" FORCE) ENDIF() + IF(KokkosKernels_ENABLE_BENCHMARK) + SET(KOKKOSKERNELS_ENABLE_BENCHMARK ON CACHE BOOL "Benchmark enabled") + INCLUDE(cmake/kokkoskernels_benchmarks.cmake) + ENDIF() ENDIF () KOKKOSKERNELS_ADD_OPTION( @@ -115,6 +117,7 @@ IF (KokkosKernels_INSTALL_TESTING) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(blas/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(ode/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) ELSE() @@ -124,7 +127,6 @@ ELSE() # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) MESSAGE(STATUS "Found Kokkos at ${Kokkos_DIR}") - KOKKOS_CHECK(OPTIONS CUDA_UVM RETURN_VALUE KOKKOS_ENABLE_CUDA_UVM) ENDIF() INCLUDE(cmake/kokkos_backends.cmake) @@ -236,6 +238,7 @@ ELSE() MESSAGE(" BLAS: ${KokkosKernels_ENABLE_COMPONENT_BLAS}") MESSAGE(" GRAPH: ${KokkosKernels_ENABLE_COMPONENT_GRAPH}") MESSAGE(" SPARSE: ${KokkosKernels_ENABLE_COMPONENT_SPARSE}") + MESSAGE(" ODE: ${KokkosKernels_ENABLE_COMPONENT_ODE}") MESSAGE("") MESSAGE("Kokkos Kernels TPLs") IF(KOKKOSKERNELS_TPL_LIST) @@ -283,6 +286,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) INCLUDE(sparse/CMakeLists.txt) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_ODE) + INCLUDE(ode/CMakeLists.txt) + ENDIF() FOREACH(DIR ${KK_INCLUDE_DIRS}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${DIR}) @@ -299,7 +305,6 @@ ELSE() # This doesn't change pre-existing behavior before the ETI changes #LIST(APPEND HEADERS ${ETI_HEADERS}) #----------------------------------------------------------------------------- - KOKKOSKERNELS_ADD_LIBRARY( kokkoskernels HEADERS ${HEADERS} @@ -315,12 +320,21 @@ ELSE() #no linking commands required - tribits does this ELSE() ADD_LIBRARY(Kokkos::kokkoskernels ALIAS kokkoskernels) + # Address kokkos/kokkos-kernels#1749 + ADD_LIBRARY(KokkosKernels::kokkoskernels ALIAS kokkoskernels) + # all_libs target is required for 
TriBITS-compliance + ADD_LIBRARY(KokkosKernels::all_libs ALIAS kokkoskernels) TARGET_LINK_LIBRARIES(kokkoskernels PUBLIC Kokkos::kokkos) FOREACH(DIR ${KK_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(kokkoskernels PUBLIC $) ENDFOREACH() TARGET_INCLUDE_DIRECTORIES(kokkoskernels PUBLIC $) + + IF(KokkosKernels_ENABLE_BENCHMARK) + INCLUDE(cmake/kokkoskernels_version_info.cmake) + check_version_info() + ENDIF() ENDIF() # FIXME_SYCL waiting for compiler support @@ -390,6 +404,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_ODE) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(ode/unit_test) + ENDIF() IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) IF (KokkosKernels_ENABLE_PERFTESTS) MESSAGE(STATUS "Enabling perf tests.") diff --git a/batched/CMakeLists.txt b/batched/CMakeLists.txt index 2816620e87..3103dfa8a0 100644 --- a/batched/CMakeLists.txt +++ b/batched/CMakeLists.txt @@ -1,5 +1,6 @@ # Adding source directory to the build LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/batched/eti) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched/dense/src) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched/dense/impl) @@ -16,6 +17,65 @@ IF (NOT KokkosKernels_ENABLE_COMPONENT_BLAS) LIST(APPEND SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/batched/KokkosBatched_Util.cpp) ENDIF() -# Adding unit-tests -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/batched) -KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/batched) +IF(KokkosKernels_ENABLE_TESTS OR KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) + # Adding unit-tests + KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/batched) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING + ${CMAKE_CURRENT_SOURCE_DIR}/batched) +ENDIF() +# NOTE: Above, the build directory 'batched' is 
not created unless unit tests +# are actually enabled (which are actually included from the base-level +# CMakeLists.txt file). And the KokkosKernelsTargets.cmake file that gets +# generated from this CMake package in the build dir will be broken if these +# are listed in the `INTERFACE_INCLUDE_DIRECTORIES` property when the build +# `batched` is not created (see Trilinos PR #11966). + +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_bll Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS RIGHT_LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_t_bll Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS RIGHT_LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_nt_bll Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS RIGHT_LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_t_bll Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS RIGHT_LAYOUTS DEVICES +) + +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_blr Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LEFT_LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_t_blr Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LEFT_LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_nt_blr Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LEFT_LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_t_blr Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LEFT_LAYOUTS DEVICES +) \ No newline at end of file diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index db369e551d..27fb0bf338 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -69,13 +69,12 @@ struct is_vector : public 
std::false_type {}; template struct is_same_mag_type { - static const bool is_specialized = - (Kokkos::Details::ArithTraits::is_specialized && - Kokkos::Details::ArithTraits::is_specialized); + static const bool is_specialized = (Kokkos::ArithTraits::is_specialized && + Kokkos::ArithTraits::is_specialized); static const bool is_mag_type_same = - std::is_same::mag_type, - typename Kokkos::Details::ArithTraits::mag_type>::value; + std::is_same::mag_type, + typename Kokkos::ArithTraits::mag_type>::value; static const bool value = is_specialized && is_mag_type_same; }; @@ -625,6 +624,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, const Trans::NoTranspose) { return subview_wrapper(v, i1, i2, i3, layout_tag); } +#if KOKKOS_VERSION < 40099 template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, @@ -635,6 +635,17 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, return transpose_2d_view(sv_nt, layout_tag); } +#else +template +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, + Kokkos::ALL_t i2, Kokkos::ALL_t i3, + const BatchLayout::Left &layout_tag, + const Trans::Transpose) { + auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); + + return transpose_2d_view(sv_nt, layout_tag); +} +#endif template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, @@ -658,6 +669,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { return subview_wrapper(v, i1, i2, i3, layout_tag); } +#if KOKKOS_VERSION < 40099 template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, @@ -666,6 +678,16 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( return transpose_2d_view(sv_nt, layout_tag); } +#else +template +KOKKOS_INLINE_FUNCTION auto subview_wrapper( + ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t 
i3, + const BatchLayout::Right &layout_tag, const Trans::Transpose &) { + auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); + + return transpose_2d_view(sv_nt, layout_tag); +} +#endif template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, diff --git a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp index cd90657e30..24ecafe0a0 100644 --- a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp @@ -38,8 +38,7 @@ struct SerialAddRadialInternal { #endif for (int i = 0; i < m; ++i) { // const auto a_real = RealPart(A[i*as]); - const auto a_real = - Kokkos::Details::ArithTraits::real(A[i * as]); + const auto a_real = Kokkos::ArithTraits::real(A[i * as]); A[i * as] += ValueType(minus_abs_tiny) * ValueType(a_real < 0); A[i * as] += ValueType(abs_tiny) * ValueType(a_real >= 0); } @@ -62,8 +61,7 @@ struct TeamAddRadialInternal { Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { // const auto a_real = RealPart(A[i*as]); - const auto a_real = - Kokkos::Details::ArithTraits::real(A[i * as]); + const auto a_real = Kokkos::ArithTraits::real(A[i * as]); A[i * as] += ValueType(minus_abs_tiny) * ValueType(a_real < 0); A[i * as] += ValueType(abs_tiny) * ValueType(a_real >= 0); }); diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index da8e989b47..611e9440b5 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -56,7 +56,7 @@ struct SerialApplyLeftHouseholderInternal { for (int j = 0; j < n; ++j) { value_type tmp = a1t[j * a1ts]; for (int i = 0; i < m; ++i) - tmp += Kokkos::Details::ArithTraits::conj(u2[i * u2s]) * + tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * 
A2[i * as0 + j * as1]; w1t[j] = tmp * inv_tau; // /= (*tau); } @@ -109,7 +109,7 @@ struct SerialApplyRightHouseholderInternal { for (int j = 0; j < n; ++j) for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= - w1[i] * Kokkos::Details::ArithTraits::conj(u2[j * u2s]); + w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); return 0; } diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp index 4d513fcf3d..2754818fbf 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp @@ -59,7 +59,7 @@ struct TeamVectorApplyLeftHouseholderInternal { Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, m), [&](const int &i, value_type &val) { - val += Kokkos::Details::ArithTraits::conj(u2[i * u2s]) * + val += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; }, tmp); @@ -146,8 +146,7 @@ struct TeamVectorApplyRightHouseholderInternal { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, m), [&](const int &i) { A2[i * as0 + j * as1] -= - w1[i] * Kokkos::Details::ArithTraits::conj( - u2[j * u2s]); + w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); }); }); } else { @@ -156,8 +155,7 @@ struct TeamVectorApplyRightHouseholderInternal { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, m), [&](const int &i) { A2[i * as0 + j * as1] -= - w1[i] * Kokkos::Details::ArithTraits::conj( - u2[j * u2s]); + w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); }); }); } diff --git a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index ab77c30e83..beaef112f3 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -19,6 +19,7 @@ /// \author Kim Liegeois (knliege@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosBlas1_team_axpby.hpp" namespace 
KokkosBatched { @@ -177,6 +178,7 @@ struct TeamVectorAxpyInternal { /// /// Serial Impl /// =========== + template KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, const XViewType& X, @@ -188,11 +190,11 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, "KokkosBatched::axpy: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -212,6 +214,9 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, } #endif + // No need to check if X.extent(0)==1 in the serial case as we don't + // parallelize the kernel anyway. + return SerialAxpyInternal::template invoke< typename alphaViewType::non_const_value_type, typename XViewType::non_const_value_type>( @@ -235,11 +240,11 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( "KokkosBatched::axpy: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -259,6 +264,13 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( } #endif + if (X.extent(0) == 1) { + KokkosBlas::Experimental::axpy( + member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); + return 0; + } + return TeamAxpyInternal::template invoke< MemberType, typename alphaViewType::non_const_value_type, typename XViewType::non_const_value_type>( @@ -283,11 +295,11 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( "KokkosBatched::axpy: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -307,6 +319,13 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( } #endif + if (X.extent(0) == 1) { + KokkosBlas::Experimental::axpy( + member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); + return 0; + } + return TeamVectorAxpyInternal::invoke< MemberType, typename alphaViewType::non_const_value_type, typename XViewType::non_const_value_type, diff --git a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index 110735ca13..2f0be4b661 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -47,6 +47,25 @@ template <> template KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); + return 1; + } +#endif return SerialCopyInternal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); @@ -56,6 +75,25 @@ template <> template KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); + return 1; + } +#endif return SerialCopyInternal::invoke(A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); @@ -93,6 +131,32 @@ struct TeamCopy { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); + return 1; + } +#endif + if (A.extent(0) == 1) { + return TeamCopy::invoke( + member, Kokkos::subview(A, 0, Kokkos::ALL), + Kokkos::subview(B, 0, Kokkos::ALL)); + } return TeamCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); @@ -105,6 +169,32 @@ struct TeamCopy { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); + return 1; + } +#endif + if (A.extent(1) == 1) { + return TeamCopy::invoke( + member, Kokkos::subview(A, Kokkos::ALL, 0), + Kokkos::subview(B, Kokkos::ALL, 0)); + } return TeamCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); @@ -143,6 +233,32 @@ struct TeamVectorCopy { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); + return 1; + } +#endif + if (A.extent(0) == 1) { + return TeamVectorCopy::invoke( + member, Kokkos::subview(A, 0, Kokkos::ALL), + Kokkos::subview(B, 0, Kokkos::ALL)); + } return TeamVectorCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); @@ -155,6 +271,32 @@ struct TeamVectorCopy { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); + return 1; + } +#endif + if (A.extent(1) == 1) { + return TeamVectorCopy::invoke( + member, Kokkos::subview(A, Kokkos::ALL, 0), + Kokkos::subview(B, Kokkos::ALL, 0)); + } return TeamVectorCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index 92d7e7c07f..a6a7673e7b 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -19,6 +19,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosBlas1_team_dot.hpp" namespace KokkosBatched { @@ -162,6 +163,7 @@ struct TeamVectorDotInternal { /// /// Serial Impl /// =========== + template <> struct SerialDot { template @@ -175,11 +177,11 @@ struct SerialDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -221,11 +223,11 @@ struct SerialDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -256,6 +258,7 @@ struct SerialDot { /// /// Team Impl /// =============== + template struct TeamDot { template @@ -270,11 +273,11 @@ struct TeamDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -295,6 +298,14 @@ struct TeamDot { return 1; } #endif + + if (X.extent(1) == 1) { + dot(0) = KokkosBlas::Experimental::dot( + member, Kokkos::subview(X, Kokkos::ALL, 0), + Kokkos::subview(Y, Kokkos::ALL, 0)); + return 0; + } + return TeamDotInternal::template invoke< MemberType, typename XViewType::non_const_value_type, typename NormViewType::non_const_value_type>( @@ -317,11 +328,11 @@ struct TeamDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -341,6 +352,14 @@ struct TeamDot { return 1; } #endif + + if (X.extent(0) == 1) { + dot(0) = KokkosBlas::Experimental::dot( + member, Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); + return 0; + } + return TeamDotInternal::template invoke< MemberType, typename XViewType::non_const_value_type, typename NormViewType::non_const_value_type>( @@ -352,6 +371,7 @@ struct TeamDot { /// /// TeamVector Impl /// =============== + template struct TeamVectorDot { template @@ -366,11 +386,11 @@ struct TeamVectorDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -391,6 +411,14 @@ struct TeamVectorDot { return 1; } #endif + + if (X.extent(1) == 1) { + dot(0) = KokkosBlas::Experimental::dot( + member, Kokkos::subview(X, Kokkos::ALL, 0), + Kokkos::subview(Y, Kokkos::ALL, 0)); + return 0; + } + return TeamVectorDotInternal::template invoke< MemberType, typename XViewType::non_const_value_type, typename NormViewType::non_const_value_type>( @@ -413,11 +441,11 @@ struct TeamVectorDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -437,6 +465,14 @@ struct TeamVectorDot { return 1; } #endif + + if (X.extent(0) == 1) { + dot(0) = KokkosBlas::Experimental::dot( + member, Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); + return 0; + } + return TeamVectorDotInternal::template invoke< MemberType, typename XViewType::non_const_value_type, typename NormViewType::non_const_value_type>( diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index c1bc0439c5..c857de19c2 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -72,7 +72,7 @@ struct SerialEigendecompositionInternal { "Serial eigendecomposition on device and/or without LAPACK " "is not implemented yet"); // typedef RealType real_type; - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; // const real_type one(1), zero(0), tol = 1e2*ats::epsilon(); // //const Kokkos::pair identity(one, zero); @@ -388,42 +388,10 @@ struct SerialEigendecompositionInternal { const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, const int wlen) { -#if defined(KOKKOS_IF_ON_HOST) KOKKOS_IF_ON_HOST((host_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen);)) KOKKOS_IF_ON_DEVICE((device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen);)) -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) // FIXME remove when - // requiring minimum - // version of - // Kokkos 3.6 - // if (as0 == 1 || as1 == 1) { - /// column major or row major and it runs on host - /// potentially it can run tpls internally - // NOTE BMK: If LAPACK not enabled, this will static_assert. - // If neither stride is unit, will runtime assert. 
- // Otherwise will succeed using LAPACK. - host_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, - urs1, w, wlen); - /* - } else { - /// arbitrary strides should be handled by native implementation - device_invoke(m, - A, as0, as1, - er, ers, - ei, eis, - UL, uls0, uls1, - UR, urs0, urs1, - w, wlen); - throw std::runtime_error("Serial eigendecomposition without unit stride - implemented yet."); - } - */ -#else - /// device code runs - device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, - urs1, w, wlen); -#endif return 0; } }; diff --git a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp index ed2e442342..ae4cf10634 100644 --- a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp @@ -68,7 +68,7 @@ struct SerialEigenvalueInternal { const bool restart = false, const int user_max_iteration = -1) { typedef RealType real_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const real_type zero(0), nan(ats::nan()), tol = 1e2 * ats::epsilon(); const int max_iteration = user_max_iteration < 0 ? 300 : user_max_iteration; diff --git a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp index ba4bc0ed9c..21587f4481 100644 --- a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp @@ -82,9 +82,9 @@ struct SerialFrancisInternal { } else { const value_type val = H[(m - 1) * hs]; const auto dist_lambda1 = - Kokkos::Details::ArithTraits::abs(lambda1.real() - val); + Kokkos::ArithTraits::abs(lambda1.real() - val); const auto dist_lambda2 = - Kokkos::Details::ArithTraits::abs(lambda2.real() - val); + Kokkos::ArithTraits::abs(lambda2.real() - val); const value_type lambda = dist_lambda1 < dist_lambda2 ? 
lambda1.real() : lambda2.real(); s = 2 * lambda; diff --git a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp index 97d3d70e9d..6b3cec25da 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp @@ -20,7 +20,6 @@ #include "KokkosBatched_Gemm_Serial_Internal.hpp" namespace KokkosBatched { -/********************* BEGIN functor-level routines *********************/ /// /// Serial Impl /// =========== @@ -352,116 +351,6 @@ SerialGemm::invoke( A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } -/********************* END functor-level routines *********************/ - -namespace Impl { -/********************* BEGIN non-functor-level routines *********************/ -template -class BatchedSerialGemm { - private: - AViewType A; - BViewType B; - CViewType C; - ScalarType alpha, beta; - size_t divisor, c_cols, batch_size; - ArgBatchSzDim batch_layout_tag; - ArgTransA transA_tag; - ArgTransB transB_tag; - - void run() { - using execution_space = typename CViewType::device_type::execution_space; - using policy_type = - Kokkos::RangePolicy; - Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size), - *this); - } - - public: - int invoke() { - if (std::is_same::value) { - // Set members for ResultsPerThread::Rank0 operator; these members allow - // each thread to calculate its C output index - if (std::is_same::value) { - batch_size = C.extent(0); - divisor = C.extent(1) * C.extent(2); - c_cols = C.extent(2); - } else { - batch_size = C.extent(2); - divisor = C.extent(0) * C.extent(1); - c_cols = C.extent(1); - } - - // Increase the number of threads by the divisor - batch_size *= divisor; - - run(); - } else if (std::is_same::value) { - if (std::is_same::value) - batch_size = C.extent(0); - else - batch_size = C.extent(2); - - run(); - } else { - std::cerr << "Error: ArgResultsPerThread 
not supported" << std::endl; - return -1; - } - return 0; - } - - BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B, - ScalarType _beta, CViewType _C) - : A(_A), B(_B), C(_C), alpha(_alpha), beta(_beta) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const ResultsPerThread::Rank0 &, const int &i) const { - // Here, the batch_idx is strided by c_rows * c_cols - auto batch_idx = i / divisor; - // For every batch, we need mod in [0, c_rows*c_cols-1] - auto mod = i % divisor; - // For every mod, we need a column index in [0, c_cols-1] - auto col_idx = mod % c_cols; - // For every mod, we need a row index in [0, c_rows-1] - auto row_idx = mod / c_cols; - - // Due to taking 1-rank subviews out, we must handle transpose here. - // Use overloads of subview_wrapper to handle transpose at compile time. - auto svA_row = subview_wrapper(A, batch_idx, row_idx, Kokkos::ALL(), - batch_layout_tag, transA_tag); - auto svB_col = subview_wrapper(B, batch_idx, Kokkos::ALL(), col_idx, - batch_layout_tag, transB_tag); - auto svC_ele = - subview_wrapper(C, batch_idx, row_idx, col_idx, batch_layout_tag); - - // Kokkos::subview(scalar, ALL) or Kokkos::subview(ALL, scalar) always - // returns a column vector. Since the subviews above handle the - // matrix transpositions, here we must perform the GEMM on: - // row_vec x col_vec, which is svA_row' x svB_col to compute the element - // of C. 
- KokkosBatched::SerialGemm::invoke(alpha, svA_row, svB_col, beta, - svC_ele); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const ResultsPerThread::Rank2 &, const int &i) const { - auto svA = - subview_wrapper(A, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - auto svB = - subview_wrapper(B, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - auto svC = - subview_wrapper(C, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - - KokkosBatched::SerialGemm::invoke( - alpha, svA, svB, beta, svC); - } -}; -/********************* END non-functor-level routines *********************/ -} // namespace Impl - } // namespace KokkosBatched #endif diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index 12b2d88250..a0b948bb13 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -20,6 +20,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Gemv_TeamVector_Internal.hpp" +#include "KokkosBlas2_team_gemv.hpp" namespace KokkosBatched { @@ -45,9 +46,20 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); + if (A.extent(0) == 1) { + KokkosBlas::TeamVectorGemv< + MemberType, Trans::NoTranspose, + Algo::Gemv::Unblocked>::invoke(member, alpha, + Kokkos::subview(A, 0, Kokkos::ALL, + Kokkos::ALL), + Kokkos::subview(x, 0, Kokkos::ALL), + beta, + Kokkos::subview(y, 0, Kokkos::ALL)); + return 0; + } return TeamVectorGemvInternal::template invoke< MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( @@ -67,7 +79,7 @@ struct 
TeamVectorGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); Kokkos::abort( @@ -87,7 +99,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); return TeamVectorGemvInternal::template invoke< @@ -109,7 +121,7 @@ struct TeamVectorGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); Kokkos::abort( diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 18cd78fd31..48627aaf30 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -20,6 +20,8 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Gemv_Team_Internal.hpp" +#include "KokkosBlas2_team_gemv.hpp" +#include namespace KokkosBatched { @@ -45,9 +47,27 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::Rank == 3, - "Batched TeamGemv requires rank-3 A matrix (use " - "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + if constexpr (Kokkos::is_dyn_rank_view::value) { + assert(A.rank_dynamic() == 3 && + "Batched TeamGemv 
requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } else { + static_assert(AViewType::rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } + + if (A.extent(0) == 1) { + KokkosBlas::TeamGemv< + MemberType, Trans::NoTranspose, + Algo::Gemv::Unblocked>::invoke(member, alpha, + Kokkos::subview(A, 0, Kokkos::ALL, + Kokkos::ALL), + Kokkos::subview(x, 0, Kokkos::ALL), + beta, + Kokkos::subview(y, 0, Kokkos::ALL)); + return 0; + } return TeamGemvInternal::template invoke< MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( @@ -67,9 +87,15 @@ struct TeamGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::Rank == 3, - "Batched TeamGemv requires rank-3 A matrix (use " - "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + /* if constexpr (Kokkos::is_dyn_rank_view::value) { + assert(A.rank_dynamic() == 3 && + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } else { + static_assert(AViewType::rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } */ Kokkos::abort( "KokkosBlas::TeamGemv for rank-3 matrix is NOT " "implemented"); @@ -87,9 +113,23 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::Rank == 3, - "Batched TeamGemv requires rank-3 A matrix (use " - "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + if constexpr (Kokkos::is_dyn_rank_view::value) { + assert(A.rank_dynamic() == 3 && + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } else { + static_assert(AViewType::rank == 3, + "Batched TeamGemv requires 
rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } + if (A.extent(0) == 1) { + KokkosBlas:: + TeamGemv::invoke( + member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), + Kokkos::subview(x, 0, Kokkos::ALL), beta, + Kokkos::subview(y, 0, Kokkos::ALL)); + return 0; + } return TeamGemvInternal::template invoke< MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( @@ -109,9 +149,15 @@ struct TeamGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::Rank == 3, - "Batched TeamGemv requires rank-3 A matrix (use " - "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + /* if constexpr (Kokkos::is_dyn_rank_view::value) { + assert(A.rank_dynamic() == 3 && + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } else { + static_assert(AViewType::rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } */ Kokkos::abort( "KokkosBlas::TeamGemv for rank-3 matrix is NOT " "implemented"); diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index c98aa08788..0ef43ee4f8 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -290,7 +290,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( for (size_t i = 0; i < n; ++i) { int row_index, col_index; - reducer_value_type value; + reducer_value_type value{}; Kokkos::MaxLoc reducer_value(value); Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, n), @@ -376,9 +376,9 @@ struct SerialGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: 
MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -449,9 +449,9 @@ struct SerialGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -501,9 +501,9 @@ struct TeamGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -579,9 +579,9 @@ struct TeamGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -637,9 +637,9 @@ struct TeamVectorGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -716,9 +716,9 @@ struct TeamVectorGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. diff --git a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp index f20e754010..4d80c6a250 100644 --- a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp @@ -54,13 +54,12 @@ struct SerialGivensInternal { } else { // here we do not care overflow caused by the division although it is // probable.... 
- r = Kokkos::Details::ArithTraits::sqrt(chi1 * chi1 + - chi2 * chi2); + r = Kokkos::ArithTraits::sqrt(chi1 * chi1 + chi2 * chi2); cs = chi1 / r; sn = chi2 / r; - if (Kokkos::Details::ArithTraits::abs(chi1) > - Kokkos::Details::ArithTraits::abs(chi2) && + if (Kokkos::ArithTraits::abs(chi1) > + Kokkos::ArithTraits::abs(chi2) && cs < zero) { cs = -cs; sn = -sn; diff --git a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index 7dcdc78811..ebd789c2e8 100644 --- a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -101,11 +101,11 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, static_assert( Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::Rank == 2, + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. @@ -152,11 +152,11 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( static_assert( Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::Rank == 2, + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. 
@@ -205,11 +205,11 @@ KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( static_assert( Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::Rank == 2, + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. diff --git a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp index eba6cdfc59..3d2b75e64d 100644 --- a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp @@ -37,7 +37,7 @@ struct SerialHessenbergQR_WithShiftInternal { /* */ ValueType *HH, const int hs0, const int hs1, const ValueType shift, /* */ Kokkos::pair *GG, const bool request_schur) { typedef ValueType value_type; - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; const int hs = hs0 + hs1; const value_type zero(0), one(1); diff --git a/batched/dense/impl/KokkosBatched_Gemm_Armpl_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp similarity index 76% rename from batched/dense/impl/KokkosBatched_Gemm_Armpl_Impl.hpp rename to batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp index 16355654c1..971fb36081 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Armpl_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_ARMPL_IMPL_HPP__ -#define 
__KOKKOSBATCHED_GEMM_ARMPL_IMPL_HPP__ +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP__ #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 #include "KokkosBatched_Util.hpp" #include "KokkosKernels_Error.hpp" @@ -22,6 +22,51 @@ namespace KokkosBatched { namespace Impl { /********************* BEGIN non-functor-level routines *********************/ + +// clang-format off +/// \brief Blocking general matrix multiply on a batch of uniform matrices. +/// +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam HandleType Specifies the handle type of the kernel handle +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as a 3-rank Kokkos::View +/// \tparam BViewType Input matrix, as a 3-rank Kokkos::View +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as a 3-rank +/// Kokkos::View +/// +/// See struct BatchedGemmHandle for details +/// \param handle [in] A handle which specifies how to invoke the batched +/// gemm. handle->get_tpl_params() returns &ninter. +/// ninter: The number of matrices to interleave. 
+/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// + +/// Usage Example: +/// BatchedArmplGemm +/// (handle, alpha, A, B, beta, C).invoke(); +// clang-format on template diff --git a/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp similarity index 84% rename from batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp rename to batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp index 301b962fcb..50d662b281 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_DBLBUF_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_DBLBUF_IMPL_HPP__ +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP__ #include "KokkosBatched_Util.hpp" #include "KokkosKernels_Error.hpp" @@ -57,6 +57,75 @@ using TagFromLayout = typename TagFromLayoutHelper::tag; // Option 2: Fix league_size and have single team solve full tile followed // by same team solving extra rows/cols (without multiplying by the // zero rows/cols) + +// clang-format off +/// \brief Non-blocking 
general matrix multiply on a batch of +/// uniform matrices with an algorithm based on: +/// B. P. D. J. Kunkel, Julian, “Performance, design, and autotuning of batched gemm for GPUs,” +/// in Lecture Notes in Computer Science, ser. ISC High Performance Computing ’16, vol. 9697, 06 2016. +/// +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// AViewType, BViewType, and CViewType: +/// BatchSzDim::Left Batch dimension is leftmost +/// BatchSzDim::Right Batch dimension is rightmost +/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For +/// this serial interface, each rank specifies how +/// much work to assign a single thread. +/// ResultsPerThread::Rank0 Each thread computes a scalar of C +/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C +/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C +/// \tparam HandleType Specifies the handle type of the kernel handle +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank +/// Kokkos::View or a 4-rank Kokkos::View for SIMD +/// operations. +/// \tparam ArgBoundsCheck Specifies whether to perform global memory access +/// bounds checks within the functor. 
Bounds checks +/// are required when matrix sizes are not evenly divisible +/// by tile sizes. +/// BoundsCheck::Yes The functor will perform bound checks (recommended) +/// BoundsCheck::No The functor will NOT perform bound checks +/// \tparam ArgAlphaFmaTag Specifies whether to apply alpha during fmas. +/// AlphaFmaTag::Yes alpha will be applied during fma (C = C * alpha + AB). +/// AlphaFmaTag::No alpha will be applied during mul (A * B * alpha). +/// \tparam TILE_M Specifies the number of rows in each tile. +/// \tparam TILE_N Specifies the number of cols in each tile. +/// \tparam TILE_K Specifies the number of cols or rows in a tile of A or tile of B, respectively. +/// +/// See struct BatchedGemmHandle for details. +/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// +/// Usage Example: +/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); +// clang-format on template +#include // Trans, BatchLayout +#include +#include + +#include "KokkosBatched_HostLevel_Gemm_Handle.hpp" // BatchedGemmHandle +#include "KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp" +#include "KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp" +#include "KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp" + +namespace KokkosBatched { +namespace Impl { +//////////////////////////////// tile_m 
////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_m() { + return 32; +} +//////////////////////////////// tile_n ////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_n() { + return 32; +} +//////////////////////////////// tile_k ////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { + return 8; +} + +// On MI100, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails +// without this. See https://github.com/kokkos/kokkos-kernels/issues/1547. +// This reduces the register allocations (REG_M and REG_N) in the double +// buffering algorithm by a factor of 2. +#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) +template <> +constexpr KOKKOS_INLINE_FUNCTION int +kk_gemm_dbl_buf_tile_k() { + return 16; +} +#endif +////////////////////////// alpha_in_fma_thresh //////////////////////////// +constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { +#ifdef __CUDACC_RDC__ + return 24; +#else + return 64; +#endif // __CUDAACC_RDC__ +} + +template +int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, + const AViewType &A, const BViewType &B, + const ScalarType beta, const CViewType &C) { + int ret = 0; + size_t c_m, c_n; + using ViewValueType = typename CViewType::value_type; + // Check for valid input views + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "BViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "CViewType must be a Kokkos::View."); + static_assert( + std::is_same::value || + std::is_same::value, + "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); + static_assert( + std::is_same::value || + std::is_same::value, + "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); + if constexpr (is_vector::value) { + 
// Check ranks of view with underlying SIMD value types + // For SIMD views, we can have either 3-rank or 4-ranks inputs. + switch (handle->get_kernel_algo_type()) { + case BaseKokkosBatchedAlgos::KK_SERIAL: + case BaseHeuristicAlgos::SQUARE: + case BaseTplAlgos::ARMPL: +#if KOKKOS_VERSION > 40099 + assert(A.rank_dynamic() == 3 && "AViewType must have rank 3."); + assert(B.rank_dynamic() == 3 && "BViewType must have rank 3."); + assert(C.rank_dynamic() == 3 && "CViewType must have rank 3."); +#endif + break; + default: + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) + << " with SIMD views." << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + break; + } + } else { + // Check ranks of views with underlying scalar value types + static_assert(static_cast(AViewType::rank) == 3, + "AViewType must have rank 3."); + static_assert(static_cast(BViewType::rank) == 3, + "BViewType must have rank 3."); + static_assert(static_cast(CViewType::rank) == 3, + "CViewType must have rank 3."); + } + + // Check for valid data access patterns + // Skip checking a_layout == b_layout == c_layout + // Skip checking for LayoutStride + using c_layout = typename CViewType::array_layout; + static_assert(!(std::is_same::value && + !std::is_same::value), + "LayoutLeft views require BatchLayout::Right"); + static_assert(!(std::is_same::value && + !std::is_same::value), + "LayoutRight views require BatchLayout::Left"); + + if constexpr (std::is_same::value) { + // c_b = C.extent(0); + c_m = C.extent(1); + c_n = C.extent(2); + } else { + // c_b = C.extent(2); + c_m = C.extent(0); + c_n = C.extent(1); + } + + // Begin checking conditions for optimal BatchedGemm invocation. 
+ using view_scalar_type = typename CViewType::value_type; + using layout_type = typename CViewType::array_layout; + using exec_space = typename CViewType::execution_space; + constexpr bool is_vector = KokkosBatched::is_vector::value; + constexpr bool on_gpu = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< + typename exec_space::memory_space>(); + constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< + typename exec_space::memory_space>(); + bool out_of_range = false; + + if (handle->enableDebug) { + std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() + << std::endl + << "execution_space:" << typeid(exec_space).name() << std::endl + << std::endl + << "is_vector:" << is_vector << std::endl + << "on_gpu:" << on_gpu << std::endl + << "on_x86_64:" << on_x86_64 << std::endl + << "on_a64fx:" << on_a64fx << std::endl; + } + + switch (handle->get_kernel_algo_type()) { + ////////////// HEURISTIC ALGOS ////////////// + case BaseHeuristicAlgos::SQUARE: + if (c_m != c_n) { + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" + << std::to_string(c_m) << ") != c_n(" << std::to_string(c_n) << ")" + << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Select optimal resultsPerThread param for BatchedSerialGemm + using bsgResultsPerThread = + std::conditional_t; + + // Select optimal mode param for SerialGemm. 
+ using bsgModeType = typename std::conditional< + is_vector, + typename std::conditional::type, + typename std::conditional< + on_gpu, Algo::Gemm::Unblocked, + typename std::conditional::type>::type>:: + type; + + if (handle->enableDebug) { + std::cout << "bsgResultsPerThread: " + << typeid(bsgResultsPerThread).name() << std::endl + << "bsgModeType: " << typeid(bsgModeType).name() << std::endl; + } + + if constexpr (on_gpu) { + if (((std::is_same::value) + ? (c_m >= 16) + : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { + handle->teamSz = handle->vecLen = 8; + constexpr int tile_m = Impl::kk_gemm_dbl_buf_tile_m(); + constexpr int tile_n = Impl::kk_gemm_dbl_buf_tile_n(); + constexpr int tile_k = Impl::kk_gemm_dbl_buf_tile_k(); + constexpr size_t alpha_in_fma_thresh = + Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); + + if (c_m % 32 == 0) { // No bounds checking + if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + } else { // apply alpha in mul + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + } + } else { // bounds checking + if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + } else { // apply alpha in mul + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + } + } + } else { + out_of_range = true; + } + } + if (!on_gpu || out_of_range) { + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); + } + break; + + // case BaseHeuristicAlgos::TALL: + // + // case BaseHeuristicAlgos::WIDE: + ////////////// TPL ALGOS ////////////// +#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 + case BaseTplAlgos::ARMPL: + ret = Impl::BatchedArmplGemm(handle, alpha, A, B, + beta, C) + .invoke(); + break; +#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL + // case BaseTplAlgos::MKL: + // + // case GemmTplAlgos::CUBLAS: + // + // case 
GemmTplAlgos::MAGMA: + + ////////////// KokkosBatched ALGOS ////////////// + case BaseKokkosBatchedAlgos::KK_SERIAL: + ret = + Impl::BatchedSerialGemm( + alpha, A, B, beta, C) + .invoke(); + break; + + // case GemmKokkosBatchedAlgos::KK_SERIALSIMD: + + case GemmKokkosBatchedAlgos::KK_SERIAL_RANK0: + ret = + Impl::BatchedSerialGemm( + alpha, A, B, beta, C) + .invoke(); + break; + + // case GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM: + // case GemmKokkosBatchedAlgos::KK_TEAM: + // case GemmKokkosBatchedAlgos::KK_TEAMVECTOR: + // case GemmKokkosBatchedAlgos::KK_TEAMSIMD: + + case GemmKokkosBatchedAlgos::KK_DBLBUF: + // Note: The tile sizes of 1x1x1 here will not perform well but must be + // selected in order to function on all devices since the serial + // execution space has a max team size of 1. KokkosKernels API users + // will need to follow an approach similar to KK_SQUARE above for best + // performance. + + // TODO: Add auto-selection of tile size based on inputs and device type + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + break; + + default: + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) << "." << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + break; + } + return ret; +} +} // namespace Impl +} // namespace KokkosBatched +#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp new file mode 100644 index 0000000000..5ff581bb64 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ +#include "KokkosBatched_Gemm_Decl.hpp" + +namespace KokkosBatched { +namespace Impl { +// clang-format off +/// \brief Non-blocking general matrix multiply on a batch of +/// uniform matrices. +/// +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose +/// \tparam ArgMode Specifies algorithm mode to use for serial work: +/// Algo::Gemm::Unblocked for no register blocking +/// Algo::Gemm::Blocked for register blocking +/// Algo::Gemm::CompactMKL for mkl compact tpl interface +/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// AViewType, BViewType, and CViewType: +/// BatchSzDim::Left Batch dimension is leftmost +/// BatchSzDim::Right Batch dimension is rightmost +/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For +/// this serial interface, each rank specifies how +/// much work to assign a single thread. 
+/// ResultsPerThread::Rank0 Each thread computes a scalar of C +/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C +/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank +/// Kokkos::View or a 4-rank Kokkos::View for SIMD +/// operations. +/// +/// See struct BatchedGemmHandle for details. +/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// +/// Usage Example: +/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); +// clang-format on +template +class BatchedSerialGemm { + private: + AViewType A; + BViewType B; + CViewType C; + ScalarType alpha, beta; + size_t divisor, c_cols, batch_size; + ArgBatchSzDim batch_layout_tag; + ArgTransA transA_tag; + ArgTransB transB_tag; + + void run() { + using execution_space = typename CViewType::device_type::execution_space; + using policy_type = + Kokkos::RangePolicy; + Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, 
batch_size), + *this); + } + + public: + int invoke() { + if (std::is_same::value) { + // Set members for ResultsPerThread::Rank0 operator; these members allow + // each thread to calculate its C output index + if (std::is_same::value) { + batch_size = C.extent(0); + divisor = C.extent(1) * C.extent(2); + c_cols = C.extent(2); + } else { + batch_size = C.extent(2); + divisor = C.extent(0) * C.extent(1); + c_cols = C.extent(1); + } + + // Increase the number of threads by the divisor + batch_size *= divisor; + + run(); + } else if (std::is_same::value) { + if (std::is_same::value) + batch_size = C.extent(0); + else + batch_size = C.extent(2); + + run(); + } else { + std::cerr << "Error: ArgResultsPerThread not supported" << std::endl; + return -1; + } + return 0; + } + + BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B, + ScalarType _beta, CViewType _C) + : A(_A), B(_B), C(_C), alpha(_alpha), beta(_beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ResultsPerThread::Rank0 &, const int &i) const { + // Here, the batch_idx is strided by c_rows * c_cols + auto batch_idx = i / divisor; + // For every batch, we need mod in [0, c_rows*c_cols-1] + auto mod = i % divisor; + // For every mod, we need a column index in [0, c_cols-1] + auto col_idx = mod % c_cols; + // For every mod, we need a row index in [0, c_rows-1] + auto row_idx = mod / c_cols; + + // Due to taking 1-rank subviews out, we must handle transpose here. + // Use overloads of subview_wrapper to handle transpose at compile time. + auto svA_row = subview_wrapper(A, batch_idx, row_idx, Kokkos::ALL(), + batch_layout_tag, transA_tag); + auto svB_col = subview_wrapper(B, batch_idx, Kokkos::ALL(), col_idx, + batch_layout_tag, transB_tag); + auto svC_ele = + subview_wrapper(C, batch_idx, row_idx, col_idx, batch_layout_tag); + + // Kokkos::subview(scalar, ALL) or Kokkos::subview(ALL, scalar) always + // returns a column vector. 
Since the subviews above handle the + // matrix transpositions, here we must perform the GEMM on: + // row_vec x col_vec, which is svA_row' x svB_col to compute the element + // of C. + KokkosBatched::SerialGemm::invoke(alpha, svA_row, svB_col, beta, + svC_ele); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ResultsPerThread::Rank2 &, const int &i) const { + auto svA = + subview_wrapper(A, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svB = + subview_wrapper(B, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svC = + subview_wrapper(C, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + + KokkosBatched::SerialGemm::invoke( + alpha, svA, svB, beta, svC); + } +}; +} // namespace Impl +} // namespace KokkosBatched +#endif \ No newline at end of file diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp new file mode 100644 index 0000000000..6ec792172b --- /dev/null +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp @@ -0,0 +1,290 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ + +#include +#include +#include // BatchedGemmHandle + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosBatched_HostLevel_Gemm_Impl.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" +#endif + +namespace KokkosBatched { +namespace Impl { +// Specialization struct which defines whether a specialization exists +// This struct is currently never specialized. +template +struct batched_gemm_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Specialization struct which defines whether a specialization exists +template +struct batched_gemm_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBatched + +// ETI specalization macros, consumed by generated *_eti_spec_avail.hpp files +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, \ + ARG_BATCH_LAYOUT, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct batched_gemm_eti_spec_avail< \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +#define 
KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +///////////////// BatchLayout::Left Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +// Include the BLL ETI specalizations +#include +#include +#include +#include + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ 
+ MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +// Include the BLR ETI specalizations +#include +#include +#include +#include + +namespace KokkosBatched { +namespace Impl { +template ::value, + bool eti_spec_avail = batched_gemm_eti_spec_avail< + ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, + ScalarType, AViewType, BViewType, CViewType>::value> +struct BatchedGemmSpec { + static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, + const AViewType &A, const BViewType &B, const ScalarType beta, + const CViewType &C) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { +#ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION +#if KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + printf( + "KokkosBatched::BatchedGemm<> ETI specialization for < %s, %s, %s, " + "%s, %s, %s, %s, %s >\n", + typeid(ArgTransA).name(), typeid(ArgTransB).name(), + typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), + typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(BViewType).name(), typeid(CViewType).name()); +#else + printf( + "KokkosBatched::BatchedGemm<> non-ETI specialization for < %s, %s, " + "%s, %s, %s, %s, %s, %s >\n", + typeid(ArgTransA).name(), typeid(ArgTransB).name(), + typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), + typeid(ScalarType).name(), typeid(AViewType).name(), + 
typeid(BViewType).name(), typeid(CViewType).name()); +#endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#endif // KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION + return KokkosBatched::Impl::BatchedGemmImpl< + ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, ScalarType, + AViewType, BViewType, CViewType>(handle, alpha, A, B, beta, C); + } +#else + ; +#endif // !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; +} // namespace Impl +} // namespace KokkosBatched + +// ETI instantiation macros, consumed by *.cpp.in files +#define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ + ARG_BATCH_LAYOUT, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct BatchedGemmSpec< \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +///////////////// BatchLayout::Left 
Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + 
Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) +#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ diff --git a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp index bf2bd7d954..05654a2f37 100644 --- a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp @@ -35,7 +35,7 @@ struct SerialLeftHouseholderInternal { /* */ ValueType* x2, const int x2s, /* */ ValueType* tau) { typedef ValueType value_type; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; const mag_type zero(0); const mag_type half(0.5); @@ -58,11 +58,10 @@ struct SerialLeftHouseholderInternal { } /// compute magnitude of chi1, equal to norm2 of chi1 - const mag_type norm_chi1 = - Kokkos::Details::ArithTraits::abs(*chi1); + const mag_type norm_chi1 = Kokkos::ArithTraits::abs(*chi1); /// compute 2 norm of x using norm_chi1 and norm_x2 - const mag_type norm_x = Kokkos::Details::ArithTraits::sqrt( + const mag_type norm_x = Kokkos::ArithTraits::sqrt( norm_x2_square + norm_chi1 * norm_chi1); /// compute alpha diff --git a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp index 40cc0714e3..64fe24fa31 100644 --- a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -36,7 +36,7 @@ struct TeamVectorLeftHouseholderInternal { /* */ ValueType *x2, const int x2s, /* */ ValueType *tau) { typedef ValueType value_type; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; const mag_type zero(0); const mag_type half(0.5); @@ -64,11 +64,10 @@ struct 
TeamVectorLeftHouseholderInternal { } /// compute magnitude of chi1, equal to norm2 of chi1 - const mag_type norm_chi1 = - Kokkos::Details::ArithTraits::abs(*chi1); + const mag_type norm_chi1 = Kokkos::ArithTraits::abs(*chi1); /// compute 2 norm of x using norm_chi1 and norm_x2 - const mag_type norm_x = Kokkos::Details::ArithTraits::sqrt( + const mag_type norm_x = Kokkos::ArithTraits::sqrt( norm_x2_square + norm_chi1 * norm_chi1); /// compute alpha diff --git a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp index 4b9c215aba..e6b34d8f1b 100644 --- a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp @@ -62,7 +62,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( if (tiny != 0) { ValueType &alpha11_reference = A[p * as0 + p * as1]; const auto alpha11_real = - Kokkos::Details::ArithTraits::real(alpha11_reference); + Kokkos::ArithTraits::real(alpha11_reference); alpha11_reference += minus_abs_tiny * ValueType(alpha11_real < 0); alpha11_reference += abs_tiny * ValueType(alpha11_real >= 0); } diff --git a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp index a5a033b451..cbc811de5e 100644 --- a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp @@ -68,7 +68,7 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( if (member.team_rank() == 0) { ValueType &alpha11_reference = A[p * as0 + p * as1]; const auto alpha11_real = - Kokkos::Details::ArithTraits::real(alpha11_reference); + Kokkos::ArithTraits::real(alpha11_reference); alpha11_reference += minus_abs_tiny * ValueType(alpha11_real < 0); alpha11_reference += abs_tiny * ValueType(alpha11_real >= 0); } diff --git a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp 
b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp index 9a4ce3378d..ea87217a37 100644 --- a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp @@ -52,7 +52,7 @@ struct SerialLeftEigenvectorFromSchurInternal { /* */ ValueType *w, const int *blks) { typedef ValueType value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; typedef Kokkos::complex complex_type; diff --git a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp index e2a7016422..42adf8eeba 100644 --- a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp @@ -31,7 +31,7 @@ struct SerialNormalizeInternal { /* */ ValueType *KOKKOS_RESTRICT v, const int vs) { typedef ValueType value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; mag_type norm(0); @@ -42,7 +42,7 @@ struct SerialNormalizeInternal { const auto v_at_i = v[i * vs]; norm += ats::real(v_at_i * ats::conj(v_at_i)); } - norm = Kokkos::Details::ArithTraits::sqrt(norm); + norm = Kokkos::ArithTraits::sqrt(norm); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -58,7 +58,7 @@ struct SerialNormalizeInternal { /* */ RealType *KOKKOS_RESTRICT vi, const int vis) { typedef RealType real_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; mag_type norm(0); @@ -70,7 +70,7 @@ struct SerialNormalizeInternal { const auto vi_at_i = vi[i * vis]; norm += vr_at_i * vr_at_i + vi_at_i * vi_at_i; } - norm = Kokkos::Details::ArithTraits::sqrt(norm); + norm = Kokkos::ArithTraits::sqrt(norm); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif diff --git 
a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp index 2eeb3ccbed..4716506064 100644 --- a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp @@ -52,7 +52,7 @@ struct SerialRightEigenvectorFromSchurInternal { /* */ ValueType *w, const int *blks) { typedef ValueType value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; typedef Kokkos::complex complex_type; diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp index 5a2cc638c4..20dab77092 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp @@ -29,6 +29,19 @@ KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_USV_Tag, const AViewType &A, const SViewType &sigma, const VViewType &Vt, const WViewType &work) { + static_assert(Kokkos::is_view_v && AViewType::rank == 2, + "SVD: A must be a rank-2 view"); + static_assert(Kokkos::is_view_v && UViewType::rank == 2, + "SVD: U must be a rank-2 view"); + static_assert(Kokkos::is_view_v && SViewType::rank == 1, + "SVD: s must be a rank-1 view"); + static_assert(Kokkos::is_view_v && VViewType::rank == 2, + "SVD: V must be a rank-2 view"); + static_assert(Kokkos::is_view_v && WViewType::rank == 1, + "SVD: W must be a rank-1 view"); + static_assert( + !std::is_same_v, + "SVD: W must be contiguous (not LayoutStride)"); using value_type = typename AViewType::non_const_value_type; return KokkosBatched::SerialSVDInternal::invoke( A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), U.data(), @@ -41,6 +54,15 @@ template KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_S_Tag, const AViewType &A, const SViewType &sigma, const WViewType &work) { 
+ static_assert(Kokkos::is_view_v && AViewType::rank == 2, + "SVD: A must be a rank-2 view"); + static_assert(Kokkos::is_view_v && SViewType::rank == 1, + "SVD: s must be a rank-1 view"); + static_assert(Kokkos::is_view_v && WViewType::rank == 1, + "SVD: W must be a rank-1 view"); + static_assert( + !std::is_same_v, + "SVD: W must be contiguous (not LayoutStride)"); using value_type = typename AViewType::non_const_value_type; return KokkosBatched::SerialSVDInternal::invoke( A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), nullptr, 0, diff --git a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp index 9e305186df..22a599ed58 100644 --- a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp @@ -37,7 +37,7 @@ struct SerialSchur2x2Internal { Kokkos::complex* lambda2, bool* is_complex) { typedef RealType real_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const real_type zero(0), one(1), half(0.5), minus_one(-1); /// compute G = [ gamma -sigma; /// sigma gamma ]; diff --git a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp index 2ff19975fc..c7f35d5c4f 100644 --- a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp @@ -76,7 +76,7 @@ struct SerialSchurInternal { const bool restart = false, const int user_max_iteration = -1) { typedef RealType real_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const real_type /* one(1), */ zero(0), tol = 1e2 * ats::epsilon(); const int max_iteration = user_max_iteration < 0 ? 
300 : user_max_iteration; if (wlen < m * 5) diff --git a/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp index e08089593a..3e4024974b 100644 --- a/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp @@ -77,7 +77,7 @@ SerialTrmmInternalLeftLower::invoke( const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int left_m = am; int right_n = bn; // echo-TODO: See about coniditionally setting conjOp at compile time. @@ -162,7 +162,7 @@ SerialTrmmInternalRightLower::invoke( const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int left_m = bm; int right_n = an; // echo-TODO: See about coniditionally setting conjOp at compile time. @@ -248,7 +248,7 @@ SerialTrmmInternalLeftUpper::invoke( const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int left_m = am; int right_n = bn; // echo-TODO: See about coniditionally setting conjOp at compile time. @@ -330,7 +330,7 @@ SerialTrmmInternalRightUpper::invoke( const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int left_m = bm; int right_n = an; // echo-TODO: See about coniditionally setting conjOp at compile time. 
diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp index 94f662a0a8..f87492ea5a 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp @@ -152,7 +152,7 @@ template KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator++(Vector, l> &a, int) { Vector, l> a0 = a; - a = a + typename Kokkos::Details::ArithTraits::mag_type(1); + a = a + typename Kokkos::ArithTraits::mag_type(1); return a0; } @@ -160,7 +160,7 @@ template KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( T, l) operator++(Vector, l> &a) { - a = a + typename Kokkos::Details::ArithTraits::mag_type(1); + a = a + typename Kokkos::ArithTraits::mag_type(1); return a; } @@ -355,7 +355,7 @@ template KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator--(Vector, l> &a, int) { Vector, l> a0 = a; - a = a - typename Kokkos::Details::ArithTraits::mag_type(1); + a = a - typename Kokkos::ArithTraits::mag_type(1); return a0; } @@ -363,7 +363,7 @@ template KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( T, l) operator--(Vector, l> &a) { - a = a - typename Kokkos::Details::ArithTraits::mag_type(1); + a = a - typename Kokkos::ArithTraits::mag_type(1); return a; } diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp index 19f4fcb54f..69bbb53c6b 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp @@ -32,7 +32,7 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) sqrt(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -48,7 
+48,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) cbrt(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -64,7 +64,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) log(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -80,7 +80,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) log10(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -96,7 +96,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) exp(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -112,7 +112,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T0, l) pow(const Vector, l> &a, const Vector, l> &b) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -140,7 +140,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T0, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) sin(const Vector, l> &a) { - typedef 
Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -156,7 +156,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) cos(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -172,7 +172,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) tan(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -188,7 +188,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) sinh(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -204,7 +204,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) cosh(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -220,7 +220,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) tanh(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -236,7 +236,7 @@ 
KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) asin(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -252,7 +252,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) acos(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -268,7 +268,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) atan(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -284,7 +284,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) atan2(const Vector, l> &a, const Vector, l> &b) { - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp index fb0c9b1f48..3fb7ac872b 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp @@ -16,6 +16,8 @@ #ifndef __KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP__ #define __KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP__ +#include + /// \author Kyungjoo Kim (kyukim@sandia.gov) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wswitch" @@ -94,20 +96,20 
@@ struct SimdViewAccess { /// rank 1 template - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::value && - 1 == ViewType::rank, - reference_type>::type - operator()(const I0 &i0, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 1 == ViewType::rank, + reference_type> + operator()(const I0 &i0, Args... /*args*/) const { return _a(i0 / vector_length)[i0 % vector_length]; } /// rank 2 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && 2 == ViewType::rank, - reference_type>::type - operator()(const I0 &i0, const I1 &i1, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && + 2 == ViewType::rank, + reference_type> + operator()(const I0 &i0, const I1 &i1, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1)[i0 % vector_length]; case 1: break; @@ -118,10 +120,10 @@ struct SimdViewAccess { /// rank 3 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 3 == ViewType::rank, - reference_type>::type + reference_type> operator()(const I0 &i0, const I1 &i1, const I2 &i2, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2)[i0 % vector_length]; @@ -134,10 +136,10 @@ struct SimdViewAccess { /// rank 4 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 4 == ViewType::rank, - reference_type>::type + reference_type> operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, Args... 
/*args*/) const { switch (PackDim::value) { @@ -153,10 +155,10 @@ struct SimdViewAccess { /// rank 5 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 5 == ViewType::rank, - reference_type>::type + reference_type> operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, Args... /*args*/) const { switch (PackDim::value) { @@ -173,10 +175,10 @@ struct SimdViewAccess { /// rank 6 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 6 == ViewType::rank, - reference_type>::type + reference_type> operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, const I5 &i5, Args... /*args*/) const { switch (PackDim::value) { @@ -199,12 +201,14 @@ struct SimdViewAccess { /// rank 7 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && - 7 == ViewType::rank, - reference_type>::type - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && + 7 == ViewType::rank, + reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, + const I4 &i4, const I5 &i5, const I6 &i6, + Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5, @@ -233,14 +237,14 @@ struct SimdViewAccess { /// rank 8 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && - 8 == ViewType::rank, - reference_type>::type - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7, - Args... 
/*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && + 8 == ViewType::rank, + reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, + const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7, + Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5, i6, diff --git a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp index 6b90b6a962..0d3a9b3df9 100644 --- a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp @@ -52,18 +52,16 @@ struct SerialWilkinsonShiftInternal { if (v < 0) { // complex - const value_type sqrt_v = - Kokkos::Details::ArithTraits::sqrt(-v); - *lambda1 = Kokkos::complex(p, sqrt_v); - *lambda2 = Kokkos::complex(p, -sqrt_v); - *is_complex = true; + const value_type sqrt_v = Kokkos::ArithTraits::sqrt(-v); + *lambda1 = Kokkos::complex(p, sqrt_v); + *lambda2 = Kokkos::complex(p, -sqrt_v); + *is_complex = true; } else { // real - const value_type sqrt_v = - Kokkos::Details::ArithTraits::sqrt(v); - *lambda1 = Kokkos::complex(p + sqrt_v); - *lambda2 = Kokkos::complex(p - sqrt_v); - *is_complex = false; + const value_type sqrt_v = Kokkos::ArithTraits::sqrt(v); + *lambda1 = Kokkos::complex(p + sqrt_v); + *lambda2 = Kokkos::complex(p - sqrt_v); + *is_complex = false; } return 0; } diff --git a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index ba144cc778..4f90c0be38 100644 --- a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -197,9 +197,9 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, "KokkosBatched::xpay: ViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - 
static_assert(ViewType::Rank == 2, + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -240,9 +240,9 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( "KokkosBatched::xpay: ViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::Rank == 2, + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -284,9 +284,9 @@ KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( "KokkosBatched::xpay: ViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::Rank == 2, + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
diff --git a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp index d182197192..1febcaa771 100644 --- a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp @@ -16,16 +16,9 @@ #ifndef __KOKKOSBATCHED_GEMM_DECL_HPP__ #define __KOKKOSBATCHED_GEMM_DECL_HPP__ -#include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" -// Includes for non-functor-level routines -#include -#include -#include - namespace KokkosBatched { -/********************* BEGIN functor-level routines *********************/ /// /// Serial Gemm /// @@ -91,574 +84,10 @@ struct Gemm { return r_val; } }; -/********************* END functor-level routines *********************/ - -/********************* BEGIN non-functor-level routines *********************/ - -namespace Impl { -/********************* BEGIN forward declarations *********************/ -// clang-format off -/// \brief Non-blocking solve of general matrix multiply on a batch of -/// uniform matrices. -/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgMode Specifies algorithm mode to use for serial work: -/// Algo::Gemm::Unblocked for no register blocking -/// Algo::Gemm::Blocked for register blocking -/// Algo::Gemm::CompactMKL for mkl compact tpl interface -/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in -/// AViewType, BViewType, and CViewType: -/// BatchSzDim::Left Batch dimension is leftmost -/// BatchSzDim::Right Batch dimension is rightmost -/// \tparam ArgResultsPerThread Specifies how to divide work among threads. 
For -/// this serial interface, each rank specifies how -/// much work to assign a single thread. -/// ResultsPerThread::Rank0 Each thread computes a scalar of C -/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C -/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank -/// Kokkos::View or a 4-rank Kokkos::View for SIMD -/// operations. -/// -/// See struct BatchedGemmHandle for details. -/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// -/// Usage Example: -/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedSerialGemm; - -// clang-format off -/// \brief Non-blocking solve of general matrix multiply on a batch of -/// uniform matrices with an algorithm based on: -/// B. P. D. J. Kunkel, Julian, “Performance, design, and autotuning of batched gemm for GPUs,” -/// in Lecture Notes in Computer Science, ser. 
ISC High Performance Computing ’16, vol. 9697, 06 2016. -/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in -/// AViewType, BViewType, and CViewType: -/// BatchSzDim::Left Batch dimension is leftmost -/// BatchSzDim::Right Batch dimension is rightmost -/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For -/// this serial interface, each rank specifies how -/// much work to assign a single thread. -/// ResultsPerThread::Rank0 Each thread computes a scalar of C -/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C -/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C -/// \tparam HandleType Specifies the handle type of the kernel handle -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank -/// Kokkos::View or a 4-rank Kokkos::View for SIMD -/// operations. -/// \tparam ArgBoundsCheck Specifies whether to perform global memory access -/// bounds checks within the functor. Bounds checks -/// are required when matrix sizes are not evenly divisible -/// by tile sizes. 
-/// BoundsCheck::Yes The functor will perform bound checks (recommended) -/// BoundsCheck::No The functor will NOT perform bound checks -/// \tparam ArgAlphaFmaTag Specifies whether to apply alpha during fmas. -/// AlphaFmaTag::Yes alpha will be applied during fma (C = C * alpha + AB). -/// AlphaFmaTag::No alpha will be applied during mul (A * B * alpha). -/// \tparam TILE_M Specifies the number of rows in each tile. -/// \tparam TILE_N Specifies the number of cols in each tile. -/// \tparam TILE_K Specifies the number of cols or rows in a tile of A or tile of B, respectively. -/// -/// See struct BatchedGemmHandle for details. -/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// -/// Usage Example: -/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedDblBufGemm; - -//////////////////////////////// tile_m ////////////////////////////////// -template -constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() { - return 32; -} -//////////////////////////////// tile_n ////////////////////////////////// -template -constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_n() { - return 32; -} -//////////////////////////////// tile_k ////////////////////////////////// -template -constexpr KOKKOS_INLINE_FUNCTION int 
kk_gemm_dlb_buf_tile_k() { - return 8; -} - -// On MI100, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails -// without this. See https://github.com/kokkos/kokkos-kernels/issues/1547. -// This reduces the register allocations (REG_M and REG_N) in the double -// buffering algorithm by a factor of 2. -#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) -template <> -constexpr KOKKOS_INLINE_FUNCTION int -kk_gemm_dlb_buf_tile_k() { - return 16; -} -#endif -////////////////////////// alpha_in_fma_thresh //////////////////////////// -constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { -#ifdef __CUDACC_RDC__ - return 24; -#else - return 64; -#endif // __CUDAACC_RDC__ -} - -// clang-format off -/// \brief Blocking solve of general matrix multiply on a batch of uniform matrices. -/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam HandleType Specifies the handle type of the kernel handle -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as a 3-rank Kokkos::View -/// \tparam BViewType Input matrix, as a 3-rank Kokkos::View -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as a 3-rank -/// Kokkos::View -/// -/// See struct BatchedGemmHandle for details -/// \param handle [in] A handle which specifies how to invoke the batched -/// gemm. handle->get_tpl_params() returns &ninter. -/// ninter: The number of matrices to interleave. 
-/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// - -/// Usage Example: -/// BatchedArmplGemm -/// (handle, alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedArmplGemm; -/********************* END forward declarations *********************/ -} // namespace Impl - -// clang-format off -/// \brief Non-blocking solve of general matrix multiply on a batch of -/// uniform matrices. -/// -/// Note: If a TPL is selected, this interface follows the blocking -/// behavior (either blocking or non-blocking) of the TPL vendor's API. -/// -/// Note: To leverage SIMD instructions, 4-rank views must be selected via the -/// template parameters documented below. 
-/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in -/// AViewType, BViewType, and CViewType: -/// BatchLayout::Left Batch dimension is leftmost -/// BatchLayout::Right Batch dimension is rightmost -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank -/// Kokkos::View or a 4-rank Kokkos::View for SIMD -/// operations. -/// -/// \param handle [in] A handle which specifies how to invoke the batched -/// gemm. -/// See struct BatchedGemmHandle for details. 
-/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchLayout::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchLayout::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchLayout::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchLayout::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchLayout::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchLayout::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// -/// Usage Example: -/// BatchedGemm(handle, alpha, A, B, beta, C); -// clang-format on -template -int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, const ScalarType beta, - const CViewType &C) { - int ret = 0; - size_t c_m, c_n; - using ViewValueType = typename CViewType::value_type; - // Check for valid input views - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); - if (is_vector::value) { - // Check ranks of view with underlying SIMD value types - // For SIMD views, we can have either 3-rank or 4-ranks inputs. 
- switch (handle->get_kernel_algo_type()) { - case BaseKokkosBatchedAlgos::KK_SERIAL: - case BaseHeuristicAlgos::SQUARE: - case BaseTplAlgos::ARMPL: - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); - break; - - // TODO: check this once KK_TEAM is supported - // case GemmKokkosBatchedAlgos::KK_TEAM: - // static_assert(static_cast(AViewType::rank) == 4, - // "AViewType must have rank 4."); - // static_assert(static_cast(BViewType::rank) == 4, - // "BViewType must have rank 4."); - // static_assert(static_cast(CViewType::rank) == 4, - // "CViewType must have rank 4."); - // break; - - default: - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) - << " with SIMD views." << std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - break; - } - } else { - // Check ranks of views with underlying scalar value types - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); - } - - // Check for valid data access patterns - // Skip checking a_layout == b_layout == c_layout - // Skip checking for LayoutStride - using c_layout = typename CViewType::array_layout; - if (std::is_same::value && - !std::is_same::value) { - throw std::runtime_error( - "Error: LayoutLeft views require BatchLayout::Right"); - } - if (std::is_same::value && - !std::is_same::value) { - throw std::runtime_error( - "Error: LayoutRight views require BatchLayout::Left"); - } - - if (std::is_same::value) { - // c_b = C.extent(0); - c_m = C.extent(1); - c_n = C.extent(2); - } else { - // c_b = C.extent(2); 
- c_m = C.extent(0); - c_n = C.extent(1); - } - - // Begin checking conditions for optimal BatchedGemm invocation. - using view_scalar_type = typename CViewType::value_type; - using layout_type = typename CViewType::array_layout; - using exec_space = typename CViewType::execution_space; - constexpr bool is_vector = KokkosBatched::is_vector::value; - constexpr bool on_gpu = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< - typename exec_space::memory_space>(); - constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< - typename exec_space::memory_space>(); - - if (handle->enableDebug) { - std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() - << std::endl - << "execution_space:" << typeid(exec_space).name() << std::endl - << std::endl - << "is_vector:" << is_vector << std::endl - << "on_gpu:" << on_gpu << std::endl - << "on_x86_64:" << on_x86_64 << std::endl - << "on_a64fx:" << on_a64fx << std::endl; - } - - switch (handle->get_kernel_algo_type()) { - ////////////// HEURISTIC ALGOS ////////////// - case BaseHeuristicAlgos::SQUARE: - if (c_m != c_n) { - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" - << std::to_string(c_m) << ") != c_n(" << std::to_string(c_n) << ")" - << std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - // Select optimal resultsPerThread param for BatchedSerialGemm - using bsgResultsPerThread = - typename std::conditional::type; - - // Select optimal mode param for SerialGemm. 
- using bsgModeType = typename std::conditional< - is_vector, - typename std::conditional::type, - typename std::conditional< - on_gpu, Algo::Gemm::Unblocked, - typename std::conditional::type>::type>:: - type; - - if (handle->enableDebug) { - std::cout << "bsgResultsPerThread: " - << typeid(bsgResultsPerThread).name() << std::endl - << "bsgModeType: " << typeid(bsgModeType).name() << std::endl; - } - - // if (on_gpu && c_m >= 20 && - // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { - // // TODO: invoke TeamShmem - // } else - if (on_gpu && ((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { - handle->teamSz = handle->vecLen = 8; - constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); - constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); - constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); - constexpr size_t alpha_in_fma_thresh = - Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); - - if (c_m % 32 == 0) { // No bounds checking - if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = - Impl::BatchedDblBufGemm< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType, - BoundsCheck::No, AlphaTag::Yes, tile_m, tile_n, tile_k>( - handle, alpha, A, B, beta, C) - .invoke(); - } else { // apply alpha in mul - ret = - Impl::BatchedDblBufGemm< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType, - BoundsCheck::No, AlphaTag::No, tile_m, tile_n, tile_k>( - handle, alpha, A, B, beta, C) - .invoke(); - } - } else { // bounds checking - if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = - Impl::BatchedDblBufGemm< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType, - BoundsCheck::Yes, AlphaTag::Yes, tile_m, tile_n, tile_k>( - handle, alpha, A, B, beta, C) - .invoke(); - } else { // apply alpha in mul - ret = - Impl::BatchedDblBufGemm< - ArgTransA, 
ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType, - BoundsCheck::Yes, AlphaTag::No, tile_m, tile_n, tile_k>( - handle, alpha, A, B, beta, C) - .invoke(); - } - } - } else { - ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) - .invoke(); - } - break; - - // case BaseHeuristicAlgos::TALL: - // - // case BaseHeuristicAlgos::WIDE: - ////////////// TPL ALGOS ////////////// -#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 - case BaseTplAlgos::ARMPL: - ret = Impl::BatchedArmplGemm(handle, alpha, A, B, - beta, C) - .invoke(); - break; -#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL - // case BaseTplAlgos::MKL: - // - // case GemmTplAlgos::CUBLAS: - // - // case GemmTplAlgos::MAGMA: - - ////////////// KokkosBatched ALGOS ////////////// - case BaseKokkosBatchedAlgos::KK_SERIAL: - ret = - Impl::BatchedSerialGemm( - alpha, A, B, beta, C) - .invoke(); - break; - - // case GemmKokkosBatchedAlgos::KK_SERIALSIMD: - - case GemmKokkosBatchedAlgos::KK_SERIAL_RANK0: - ret = - Impl::BatchedSerialGemm( - alpha, A, B, beta, C) - .invoke(); - break; - - // case GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM: - // case GemmKokkosBatchedAlgos::KK_TEAM: - // case GemmKokkosBatchedAlgos::KK_TEAMVECTOR: - // case GemmKokkosBatchedAlgos::KK_TEAMSIMD: - - case GemmKokkosBatchedAlgos::KK_DBLBUF: - // Note: The tile sizes of 1x1x1 here will not perform well but must be - // selected in order to function on all devices since the serial execution - // space has a max team size of 1. KokkosKernels API users will need to - // follow an approach similar to KK_SQUARE above for best performance. - - // TODO: Add auto-selection of tile size based on inputs and device type - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) - .invoke(); - break; - - default: - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) << "." 
<< std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - break; - } - return ret; -} -/********************* END non-functor-level routines *********************/ } // namespace KokkosBatched #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_Gemm_TeamVector_Impl.hpp" -#include "KokkosBatched_Gemm_DblBuf_Impl.hpp" -#include "KokkosBatched_Gemm_Armpl_Impl.hpp" -#endif +#endif // __KOKKOSBATCHED_GEMM_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp new file mode 100644 index 0000000000..4f62d0b0d4 --- /dev/null +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -0,0 +1,107 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ + +// Include explicit specializations of BatchedGemm. +// If ETI_ONLY is disabled, the primary template will +// be inlined into each caller's invocation using non- +// ETI'd template arguments. +#include "KokkosBatched_HostLevel_Gemm_Spec.hpp" + +namespace KokkosBatched { +// clang-format off +/// \brief Non-blocking solve of general matrix multiply on a batch of +/// uniform matrices. +/// +/// Note: If a TPL is selected, this interface follows the blocking +/// behavior (either blocking or non-blocking) of the TPL vendor's API. 
+/// +/// Note: To leverage SIMD instructions, 4-rank views must be selected via the +/// template parameters documented below. +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose +/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// AViewType, BViewType, and CViewType: +/// BatchLayout::Left Batch dimension is leftmost +/// BatchLayout::Right Batch dimension is rightmost +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank +/// Kokkos::View or a 4-rank Kokkos::View for SIMD +/// operations. +/// +/// \param handle [in] A handle which specifies how to invoke the batched +/// gemm. +/// See struct BatchedGemmHandle for details. 
+/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchLayout::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchLayout::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchLayout::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchLayout::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchLayout::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchLayout::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// +/// Usage Example: +/// BatchedGemm(handle, alpha, A, B, beta, C); +// clang-format on +template +inline int BatchedGemm(BatchedGemmHandleType *const handle, + const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, + const CViewType &C) { + // Minimize the number of ImplBatchedGemmWrapper instantiations, by + // standardizing on particular View specializations for its template + // parameters. + using UnifiedAVT = Kokkos::View< + typename AViewType::value_type ***, typename AViewType::array_layout, + typename AViewType::device_type, Kokkos::MemoryTraits>; + using UnifiedBVT = Kokkos::View< + typename BViewType::value_type ***, typename BViewType::array_layout, + typename BViewType::device_type, Kokkos::MemoryTraits>; + using UnifiedCVT = Kokkos::View>; + + // Go through specialization layer in case ETI'd symbols are available. 
+ return Impl::BatchedGemmSpec::run(handle, alpha, A, B, + beta, C); +} +} // namespace KokkosBatched +#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_Gemm_Handle.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp similarity index 59% rename from batched/dense/src/KokkosBatched_Gemm_Handle.hpp rename to batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp index 6ab94b92d7..95e8f36bc2 100644 --- a/batched/dense/src/KokkosBatched_Gemm_Handle.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp @@ -14,12 +14,8 @@ // //@HEADER -// -// Created by Harvey, Evan on 7/13/21. -// - -#ifndef KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP -#define KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ #include "KokkosBatched_Kernel_Handle.hpp" @@ -56,44 +52,44 @@ enum GEMM_KOKKOS_BATCHED_ALGOS : int { // clang-format off /// \brief Handle for selecting runtime behavior of the BatchedGemm interface. /// -/// \var kernelAlgoType Specifies which algorithm to use for invocation (default, SQUARE). -/// -/// Specifies whether to select optimal invocations based on inputs and -/// heuristics: -/// SQUARE select invocations based on square matrix heuristics where M=N -/// TALL select invocations based on tall matrix heuristics where M>N -/// WIDE select invocations based on wide matrix heuristics where M= 24. -/// \var teamSz Specifies the team size that will affect any KK algorithm which uses +/// Specifies whether to select optimal invocations based on inputs and +/// heuristics: +/// SQUARE select invocations based on square matrix heuristics where M=N +/// TALL select invocations based on tall matrix heuristics where M>N +/// WIDE select invocations based on wide matrix heuristics where M= 24. 
+/// \param teamSz Specifies the team size that will affect any KK algorithm which uses /// TeamPolicy (default, Kokkos::AUTO). /// Note: Only applied if useAlgo_type == KK_* -/// \var vecLen Specifies the vector length that will affect any KK algorithm which +/// \param vecLen Specifies the vector length that will affect any KK algorithm which /// uses TeamPolicy and Kokkos::ThreadVectorRange or Kokkos::TeamVectorRange /// (default, Kokkos::AUTO). /// Note: Only applied if useAlgo_type == KK_* @@ -166,4 +162,4 @@ class BatchedGemmHandle : public BatchedKernelHandle { } // namespace KokkosBatched -#endif // KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP +#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp index faa6b4f48c..051f78979d 100644 --- a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp +++ b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp @@ -14,13 +14,10 @@ // //@HEADER -// -// Created by Harvey, Evan on 7/13/21. -// - #ifndef KOKKOSKERNELS_KOKKOSBATCHED_KERNEL_HEADER_HPP #define KOKKOSKERNELS_KOKKOSBATCHED_KERNEL_HEADER_HPP +#include #include "KokkosKernels_Error.hpp" #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) @@ -92,55 +89,55 @@ struct TplParams { // clang-format off /// \brief Handle for selecting runtime behavior of the BatchedGemm interface. /// -/// \var kernelAlgoType Specifies which algorithm to use for invocation (default, SQUARE). 
-/// -/// Specifies whether to select optimal invocations based on inputs and -/// heuristics: -/// SQUARE select invocations based on square matrix heuristics where M=N -/// TALL select invocations based on tall matrix heuristics where M>N -/// WIDE select invocations based on wide matrix heuristics where MN +/// WIDE select invocations based on wide matrix heuristics where M>, l>> { // arith traits overload for vector types namespace Kokkos { -namespace Details { // do not use Vector alone as other can use the name. @@ -337,7 +336,6 @@ class ArithTraits< } }; -} // namespace Details } // namespace Kokkos #endif diff --git a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp index e938198467..e27419e7c2 100644 --- a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp +++ b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp @@ -36,7 +36,7 @@ class Vector, l> { public: using type = Vector, l>; using value_type = T; - using mag_type = typename Kokkos::Details::ArithTraits::mag_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; enum : int { vector_length = l }; diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index c4e09d6e68..d57e671908 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -17,7 +17,8 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" +#include "KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -36,8 +37,7 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; using batchLayout = typename ParamTagType::batchLayout; - using view_layout = typename ViewType::array_layout; - using ats = 
Kokkos::Details::ArithTraits; + using ats = Kokkos::ArithTraits; int ret = 0; auto algo_type = batchedGemmHandle->get_kernel_algo_type(); @@ -127,11 +127,6 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual); // Compute c_actual } catch (const std::runtime_error& error) { - bool is_invalid_layout = - (std::is_same::value && - std::is_same::value) || - (std::is_same::value && - std::is_same::value); std::string error_msg = error.what(); if (algo_type == BaseHeuristicAlgos::SQUARE && matCdim1 != matCdim2) { ; @@ -140,17 +135,14 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, auto ninter = batchedGemmHandle->get_tpl_params()[0]; // No runtime errors expected since layout is valid, double is a supported // type, and ninter != 0 - if (!is_invalid_layout && - std::is_same::value && + if (std::is_same::value && ninter != 0) { FAIL() << (error_msg + fmsg + fmsg_rhs); } #else ; // We expect a runtime error if the ARMPL TPL is not enabled #endif - } else if (!is_invalid_layout) { - // No runtime errors expected since we only support certain BatchLayouts - // for LayoutLeft and LayoutRight. 
+ } else { FAIL() << (error_msg + fmsg + fmsg_rhs); } return; @@ -290,11 +282,13 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ViewType a_actual("a_actual", N, matAdim1, matAdim2); ViewType b_actual("b_actual", N, matBdim1, matBdim2); ViewType c_actual("c_actual", N, matCdim1, matCdim2); - using ta = typename ParamTagType::transA; - using tb = typename ParamTagType::transB; - using bl = typename ParamTagType::batchLayout; - BatchedGemm(&batchedGemmHandle, 0.34, a_actual, - b_actual, 0.43, c_actual); + using ta = typename ParamTagType::transA; + using tb = typename ParamTagType::transB; + using bl = typename ParamTagType::batchLayout; + ScalarType alpha = 0.34; + ScalarType beta = 0.43; + BatchedGemm(&batchedGemmHandle, alpha, a_actual, + b_actual, beta, c_actual); std::string fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); FAIL() << fmsg; } catch (const std::runtime_error& error) { @@ -381,32 +375,56 @@ void test_batched_gemm_with_layout(int N) { template int test_batched_gemm() { -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - typedef Kokkos::View llVt; - test_batched_gemm_with_layout(0); - test_batched_gemm_with_layout(1); - test_batched_gemm_with_layout(4); - test_batched_gemm_with_layout(8); - test_batched_gemm_with_layout(16); +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + if constexpr (std::is_same_v) { + using param_tag_type = ::Test::SharedParamTag; + typedef Kokkos::View llVt; + test_batched_gemm_with_layout(0); + test_batched_gemm_with_layout(1); + test_batched_gemm_with_layout(4); + test_batched_gemm_with_layout(8); + test_batched_gemm_with_layout(16); + } else { + std::cerr << "TEST SKIPPED since BatchLayout is not Right." << std::endl; + } +#else + std::cerr << "TEST SKIPPED since LayoutLeft is not ETI'd." 
<< std::endl; #endif // KOKKOSKERNELS_INST_LAYOUTLEFT -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) - typedef Kokkos::View lrVt; - test_batched_gemm_with_layout(0); - test_batched_gemm_with_layout(1); - test_batched_gemm_with_layout(4); - test_batched_gemm_with_layout(8); - test_batched_gemm_with_layout(16); +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + if constexpr (std::is_same_v) { + using param_tag_type = ::Test::SharedParamTag; + typedef Kokkos::View lrVt; + test_batched_gemm_with_layout(0); + test_batched_gemm_with_layout(1); + test_batched_gemm_with_layout(4); + test_batched_gemm_with_layout(8); + test_batched_gemm_with_layout(16); + } else { + std::cerr << "TEST SKIPPED since BatchLayout is not Left." << std::endl; + } +#else + std::cerr << "TEST SKIPPED since LayoutRight is not ETI'd." << std::endl; #endif // KOKKOSKERNELS_INST_LAYOUTRIGHT return 0; } diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp index 73ff26a8a7..a2b9edf1e6 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp @@ -22,8 +22,6 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_left) { test_batched_gemm, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { typedef 
::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, @@ -64,8 +56,6 @@ TEST_F(TestCategory, test_batched_gemm, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } #endif @@ -108,8 +92,6 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_left) { test_batched_gemm, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, @@ -150,8 +126,6 @@ TEST_F(TestCategory, test_batched_gemm, 
Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { typedef ::Test::SharedParamTag, Kokkos::complex, param_tag_type>(); - test_batched_gemm, - Kokkos::complex, param_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp index 3bc48c1aaf..00561e0317 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp @@ -13,7 +13,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#if defined(KOKKOS_BHALF_T_IS_FLOAT) + +// We do not ETI half-types. Only test this if ETI ONLY is off +// and bhalf_t is not an alias to float. 
+#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ + defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_right) { @@ -63,8 +60,6 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_right) { test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT -#if defined(KOKKOS_HALF_T_IS_FLOAT) +// We do not ETI half-types. Only test this if ETI ONLY is off +// and half_t is not an alias to float. 
+#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ + defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { @@ -148,8 +133,6 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT @@ -191,7 +168,6 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_left) { param_tag_type; test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_left) { typedef ::Test::SharedParamTag(); - 
test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { @@ -224,7 +197,6 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { param_tag_type; test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_left) { - typedef ::Test::SharedParamTag - param_tag_type; + using param_tag_type = + ::Test::SharedParamTag; test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_left) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { @@ -293,7 +258,6 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { param_tag_type; test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_right) { typedef 
::Test::SharedParamTag(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_right) { typedef ::Test::SharedParamTag(); - test_batched_gemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp index ebefbbabd2..2bde3f7fad 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp @@ -65,7 +65,7 @@ void impl_test_batched_axpy(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; typedef typename ViewType::const_value_type const_value_type; typedef typename alphaViewType::const_value_type alpha_const_value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp index c8f745b006..8304657849 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp @@ -81,7 +81,7 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; using value_type = typename ViewType::value_type; - using ats = Kokkos::Details::ArithTraits; + using ats = Kokkos::ArithTraits; /// randomized input testing views ScalarType alpha = ScalarType(1.5); diff --git a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp index f487515a7c..3b17d81d48 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp @@ -71,10 +71,9 @@ template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; - typedef 
Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using NormViewType = Kokkos::View; diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp index c59d1aed1b..d3cbd6c024 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp @@ -140,7 +140,7 @@ template void impl_test_batched_inverselu(const int N, const int BlkSize) { typedef typename AViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views AViewType a0("a0", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_SerialLU.hpp b/batched/dense/unit_test/Test_Batched_SerialLU.hpp index 335b4ee9bf..23b72893b2 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU.hpp @@ -61,7 +61,7 @@ struct Functor_TestBatchedSerialLU { template void impl_test_batched_lu(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index a841cc7ba9..5aa832f0df 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -36,45 +36,35 @@ float svdEpsilon() { } } // namespace Test -template -double simpleNorm2(const Vector& v) { - using Scalar = typename Vector::non_const_value_type; - using KAT = Kokkos::ArithTraits; - auto vhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), 
v); - double d = 0; - for (size_t i = 0; i < v.extent(0); i++) { - double m = KAT::abs(vhost(i)); - d += m * m; - } - return std::sqrt(d); -} - +// NOTE: simpleDot and simpleNorm2 currently support only real scalars (OK since +// SVD does as well) template typename V1::non_const_value_type simpleDot(const V1& v1, const V2& v2) { using Scalar = typename V1::non_const_value_type; - using KAT = Kokkos::ArithTraits; - auto v1host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v1); - auto v2host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v2); - typename V1::non_const_value_type val = KAT::zero(); - for (size_t i = 0; i < v1.extent(0); i++) { - val += v1host(i) * v2host(i); - } - return val; + Scalar d; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, v1.extent(0)), + KOKKOS_LAMBDA(int i, Scalar& ld) { ld += v1(i) * v2(i); }, d); + return d; +} +template +typename V::non_const_value_type simpleNorm2(const V& v) { + return Kokkos::sqrt(simpleDot(v, v)); } // Check that all columns of X are unit length and pairwise orthogonal template void verifyOrthogonal(const Mat& X) { - using value_type = typename Mat::non_const_value_type; - int k = X.extent(1); + using Scalar = typename Mat::non_const_value_type; + int k = X.extent(1); for (int i = 0; i < k; i++) { auto col1 = Kokkos::subview(X, Kokkos::ALL(), i); double len = simpleNorm2(col1); - Test::EXPECT_NEAR_KK(len, 1.0, Test::svdEpsilon()); + Test::EXPECT_NEAR_KK(len, 1.0, Test::svdEpsilon()); for (int j = 0; j < i; j++) { auto col2 = Kokkos::subview(X, Kokkos::ALL(), j); - double d = Kokkos::ArithTraits::abs(simpleDot(col1, col2)); - Test::EXPECT_NEAR_KK(d, 0.0, Test::svdEpsilon()); + double d = Kokkos::ArithTraits::abs(simpleDot(col1, col2)); + Test::EXPECT_NEAR_KK(d, 0.0, Test::svdEpsilon()); } } } @@ -82,8 +72,8 @@ void verifyOrthogonal(const Mat& X) { template void verifySVD(const AView& A, const UView& U, const VtView& Vt, const SigmaView& sigma) { - using value_type = typename 
AView::non_const_value_type; - using KAT = Kokkos::ArithTraits; + using Scalar = typename AView::non_const_value_type; + using KAT = Kokkos::ArithTraits; // Check that U/V columns are unit length and orthogonal, and that U * // diag(sigma) * V^T == A int m = A.extent(0); @@ -93,7 +83,7 @@ void verifySVD(const AView& A, const UView& U, const VtView& Vt, // NOTE: V^T being square and orthonormal implies that V is, so we don't have // to transpose it here. verifyOrthogonal(Vt); - AView usvt("USV^T", m, n); + Kokkos::View usvt("USV^T", m, n); for (int i = 0; i < maxrank; i++) { auto Ucol = Kokkos::subview(U, Kokkos::ALL(), Kokkos::make_pair(i, i + 1)); @@ -103,7 +93,7 @@ void verifySVD(const AView& A, const UView& U, const VtView& Vt, } for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { - Test::EXPECT_NEAR_KK(usvt(i, j), A(i, j), Test::svdEpsilon()); + Test::EXPECT_NEAR_KK(usvt(i, j), A(i, j), Test::svdEpsilon()); } } // Make sure all singular values are positive @@ -389,11 +379,86 @@ void testSVD() { testSerialSVDSingularValuesOnly(10, 8); } +template +KOKKOS_INLINE_FUNCTION constexpr auto Determinant(ViewT F) + -> std::enable_if_t::value && ViewT::rank == 2, + double> { + return (F(0, 0) * F(1, 1) * F(2, 2) + F(0, 1) * F(1, 2) * F(2, 0) + + F(0, 2) * F(1, 0) * F(2, 1) - + (F(0, 2) * F(1, 1) * F(2, 0) + F(0, 1) * F(1, 0) * F(2, 2) + + F(0, 0) * F(1, 2) * F(2, 1))); +} + +template +void GenerateTestData(ViewT data) { + using memory_space = typename ExeSpace::memory_space; + // finite difference should return dPK2dU. So, we can analyze two cases. 
+ Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(data, random, 1.0); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, data.extent(0)), KOKKOS_LAMBDA(int i) { + auto data_i = Kokkos::subview(data, i, Kokkos::ALL(), Kokkos::ALL()); + while (Determinant(data_i) < 0.5) { + data_i(0, 0) += 1.0; + data_i(1, 1) += 1.0; + data_i(2, 2) += 1.0; + } + }); +} + +template +void testIssue1786() { + using memory_space = typename ExeSpace::memory_space; + constexpr int num_tests = 4; + Kokkos::View matrices("data", + num_tests); + GenerateTestData(matrices); + Kokkos::View Us("Us", + matrices.extent(0)); + Kokkos::View Ss("Ss", matrices.extent(0)); + Kokkos::View Vts("Vts", + matrices.extent(0)); + // Make sure the 2nd dimension of works is contiguous + Kokkos::View works( + "works", matrices.extent(0)); + Kokkos::View matrices_copy( + "matrices_copy", matrices.extent(0)); + // make a copy of the input data to avoid overwriting it + Kokkos::deep_copy(matrices_copy, matrices); + auto policy = Kokkos::RangePolicy(0, matrices.extent(0)); + Kokkos::parallel_for( + "polar decomposition", policy, KOKKOS_LAMBDA(int i) { + auto matrix_copy = + Kokkos::subview(matrices_copy, i, Kokkos::ALL(), Kokkos::ALL()); + auto U = Kokkos::subview(Us, i, Kokkos::ALL(), Kokkos::ALL()); + auto S = Kokkos::subview(Ss, i, Kokkos::ALL()); + auto Vt = Kokkos::subview(Vts, i, Kokkos::ALL(), Kokkos::ALL()); + auto work = Kokkos::subview(works, i, Kokkos::ALL()); + KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag{}, + matrix_copy, U, S, Vt, work); + }); + + auto Us_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Us); + auto Ss_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Ss); + auto Vts_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Vts); + auto matrices_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, matrices); + for (int i = 0; i < num_tests; i++) { + auto A = Kokkos::subview(matrices_h, i, Kokkos::ALL(), Kokkos::ALL()); 
+ auto U = Kokkos::subview(Us_h, i, Kokkos::ALL(), Kokkos::ALL()); + auto S = Kokkos::subview(Ss_h, i, Kokkos::ALL()); + auto Vt = Kokkos::subview(Vts_h, i, Kokkos::ALL(), Kokkos::ALL()); + verifySVD(A, U, Vt, S); + } +} + #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_svd_double) { // Test general SVD on a few different input sizes (full rank randomized) testSVD(); testSVD(); + testIssue1786(); + testIssue1786(); } #endif @@ -402,5 +467,7 @@ TEST_F(TestCategory, batched_scalar_serial_svd_float) { // Test general SVD on a few different input sizes (full rank randomized) testSVD(); testSVD(); + testIssue1786(); + testIssue1786(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp index dd39be3dd1..48e8e5dead 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp @@ -139,7 +139,7 @@ struct Functor_TestBatchedSerialSolveLU { template void impl_test_batched_solvelu(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp index b688a46e2e..af38e62e4d 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp @@ -22,6 +22,8 @@ #include "KokkosKernels_TestUtils.hpp" +#include + using namespace KokkosBatched; namespace Test { @@ -59,7 +61,7 @@ struct VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; 
ScalarA alpha; ScalarC beta; @@ -148,7 +150,7 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, const char* trans) { typedef typename ViewType::value_type value_type; typedef typename DeviceType::execution_space execution_space; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; ScalarType alpha(1.0); ScalarType beta(0.0); @@ -165,7 +167,8 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, Kokkos::create_mirror_view(B_actual); typename ViewType::HostMirror B_expected_host = Kokkos::create_mirror_view(B_expected); - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); using ViewTypeSubA = decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp index f109c44e0b..c0ef098652 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp @@ -75,7 +75,7 @@ template void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ScalarType alpha(1.0); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp index 9dc003dd19..f05a6f7fa5 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp @@ -74,7 +74,7 @@ template void impl_test_batched_trsv(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ScalarType alpha(1.5); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp 
b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp index d9fb714008..8f4ae64b7e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp @@ -22,6 +22,8 @@ #include "KokkosKernels_TestUtils.hpp" +#include + #define PRINT_MAT 0 using namespace KokkosBatched; @@ -61,7 +63,7 @@ struct VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -141,7 +143,7 @@ template ats; + typedef Kokkos::ArithTraits ats; ScalarType alpha(1.0); ScalarType beta(0.0); @@ -161,7 +163,8 @@ void impl_test_batched_trtri(const int N, const int K) { typename ViewType::HostMirror I_host = Kokkos::create_mirror_view(A_I); typename ViewType::HostMirror A_host = Kokkos::create_mirror_view(A); - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); using ViewTypeSubA = decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp index 873b244bd8..7941fc0284 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp @@ -77,7 +77,7 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { typedef typename ViewType::value_type value_type; typedef typename ViewType::const_value_type const_value_type; typedef typename alphaViewType::const_value_type alpha_const_value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp 
b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp index c60552827e..9023a009af 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp @@ -90,7 +90,7 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, using transB = typename ParamTagType::transB; using execution_space = typename DeviceType::execution_space; using value_type = typename ViewType::value_type; - using ats = Kokkos::Details::ArithTraits; + using ats = Kokkos::ArithTraits; /// randomized input testing views ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp index 2ecdb60bcf..89f67e2731 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp @@ -83,10 +83,9 @@ template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using NormViewType = Kokkos::View; diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp index 4a0d60ba2f..8657de9856 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp @@ -161,7 +161,7 @@ template void impl_test_batched_inverselu(const int N, const int BlkSize) { typedef typename AViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views AViewType a0("a0", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamLU.hpp b/batched/dense/unit_test/Test_Batched_TeamLU.hpp index f7ac07ce46..04e191b9cb 
100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU.hpp @@ -69,7 +69,7 @@ struct Functor_TestBatchedTeamLU { template void impl_test_batched_lu(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp index 77b3d697a8..41287f9b52 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp @@ -155,7 +155,7 @@ struct Functor_TestBatchedTeamSolveLU { template void impl_test_batched_solvelu(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp index 63effa103c..2f7781745d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp @@ -85,7 +85,7 @@ template void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ScalarType alpha(1.0); diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp index 4d20bbc7cf..bb00b78736 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp @@ -83,7 +83,7 @@ template void impl_test_batched_trsv(const int N, const int BlkSize) { 
typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ScalarType alpha(1.5); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp index 83d47edf8f..5ea8a80717 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp @@ -78,7 +78,7 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { typedef typename ViewType::value_type value_type; typedef typename ViewType::const_value_type const_value_type; typedef typename alphaViewType::const_value_type alpha_const_value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp index d3aa42c49a..327f28353e 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp @@ -85,7 +85,7 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, using transB = typename ParamTagType::transB; using execution_space = typename DeviceType::execution_space; using value_type = typename ViewType::value_type; - using ats = Kokkos::Details::ArithTraits; + using ats = Kokkos::ArithTraits; /// randomized input testing views ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp index 8392e1b9fc..2026f2f81d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp @@ -84,10 +84,9 @@ template void impl_test_batched_gesv(const int N, 
const int BlkSize) { typedef typename MatrixType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using NormViewType = Kokkos::View; diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp index d7e237094d..58d305f494 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp @@ -78,7 +78,7 @@ struct Functor_TestBatchedTeamVectorQR { member.team_barrier(); /// xx = bb; - TeamVectorCopy::invoke(member, bb, xx); + TeamVectorCopy::invoke(member, bb, xx); member.team_barrier(); /// xx = Q^{T}xx; @@ -110,7 +110,7 @@ template void impl_test_batched_qr(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const value_type one(1); /// randomized input testing views MatrixViewType a("a", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index 648ae43566..c86d4e86a8 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -80,7 +80,7 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { member.team_barrier(); /// xx = bb; - TeamVectorCopy::invoke(member, bb, xx); + TeamVectorCopy::invoke(member, bb, xx); member.team_barrier(); /// xx = Q^{T} xx; @@ -119,7 +119,7 @@ template void impl_test_batched_qr_with_columnpivoting(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef 
Kokkos::ArithTraits ats; // const value_type one(1); /// randomized input testing views MatrixViewType a("a", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp index 9d080e6e48..29496c1b87 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp @@ -132,7 +132,7 @@ template void impl_test_batched_solve_utv(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // const value_type one(1); /// randomized input testing views MatrixViewType r("r", N, BlkSize, 3); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp index 0d52d0d0e4..45d6093f2a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp @@ -136,7 +136,7 @@ template void impl_test_batched_solve_utv2(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // const value_type one(1); /// randomized input testing views MatrixViewType r("r", N, BlkSize, 3); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp index f61cf2729a..527c93e059 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp @@ -166,7 +166,7 @@ template void impl_test_batched_utv(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // const value_type one(1); /// randomized input 
testing views MatrixViewType r("r", N, BlkSize, 3); diff --git a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp index 0c0c80f7b8..1006325f94 100644 --- a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp @@ -52,10 +52,10 @@ void impl_test_complex_real_imag_value() { a[k].imag() = k * 5 + 4; } - const auto a_real = Kokkos::Details::ArithTraits::real(a); - const auto a_imag = Kokkos::Details::ArithTraits::imag(a); + const auto a_real = Kokkos::ArithTraits::real(a); + const auto a_imag = Kokkos::ArithTraits::imag(a); - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); for (int k = 0; k < vector_length; ++k) { EXPECT_NEAR(a[k].real(), a_real[k], eps); @@ -71,7 +71,7 @@ void impl_test_batched_vector_arithmatic() { typedef typename vector_type::value_type value_type; const int vector_length = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; vector_type a, b, c; diff --git a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp index a740bac9dd..9393afd77b 100644 --- a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp @@ -45,7 +45,7 @@ void impl_test_batched_vector_logical() { typedef ValueType value_type; const int vector_length = VectorLength; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; vector_int_type a, b; diff --git a/batched/dense/unit_test/Test_Batched_VectorMath.hpp b/batched/dense/unit_test/Test_Batched_VectorMath.hpp index da0556fc0f..d2aa9eb7bc 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMath.hpp +++ 
b/batched/dense/unit_test/Test_Batched_VectorMath.hpp @@ -46,7 +46,7 @@ void impl_test_batched_vector_math() { typedef typename vector_type::value_type value_type; const int vector_length = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; vector_type a, b, aref, bref; @@ -136,7 +136,7 @@ int test_batched_vector_math() { // template // int test_complex_pow() { -// typedef Kokkos::Details::ArithTraits > ats; +// typedef Kokkos::ArithTraits > ats; // typedef typename ats::mag_type mag_type; // const mag_type eps = 1.0e3 * ats::epsilon(); diff --git a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp index e465af5417..70d0e10cd2 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp @@ -46,7 +46,7 @@ void impl_test_batched_vector_misc() { typedef typename vector_type::value_type value_type; const int vector_length = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; vector_type a, b, c; diff --git a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp index bf6c76d1ec..54eb2938e5 100644 --- a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp @@ -46,7 +46,7 @@ void impl_test_batched_vector_relation() { typedef typename vector_type::value_type value_type; const int vector_length = vector_type::vector_length; - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; vector_type a, b; diff --git a/batched/dense/unit_test/Test_Batched_VectorView.hpp b/batched/dense/unit_test/Test_Batched_VectorView.hpp index a5b752b3d1..793c4ac3f3 100644 --- 
a/batched/dense/unit_test/Test_Batched_VectorView.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorView.hpp @@ -67,7 +67,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0 / vl, i1, i2, i3, i4, i5, i6, i7)[i0 % vl], @@ -79,7 +79,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1 / vl, i2, i3, i4, i5, i6, i7)[i1 % vl], @@ -91,7 +91,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2 / vl, i3, i4, i5, i6, i7)[i2 % vl], @@ -103,7 +103,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3 / vl, i4, i5, i6, i7)[i3 % vl], @@ -115,7 +115,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename 
ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4 / vl, i5, i6, i7)[i4 % vl], @@ -127,7 +127,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5 / vl, i6, i7)[i5 % vl], @@ -139,7 +139,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6 / vl, i7)[i6 % vl], @@ -151,7 +151,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6, i7 / vl)[i7 % vl], @@ -382,9 +382,19 @@ TEST_F(TestCategory, batched_vector_view_simd_scomplex8) { TEST_F(TestCategory, batched_vector_view_simd_dcomplex2) { test_batched_vector_view >, 2>(); } + +#if defined(KOKKOS_COMPILER_INTEL) && \ + ((KOKKOS_COMPILER_INTEL > 1900) && (KOKKOS_COMPILER_INTEL <= 2021)) +TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { + printf( + "Skipped: intel compiler version > 19.0.05 && <= 2021\n" + "See https://github.com/kokkos/kokkos-kernels/issues/1673."); +} +#else TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { test_batched_vector_view >, 4>(); } -#endif +#endif // KOKKOS_COMPILER_INTEL +#endif // 
KOKKOSKERNELS_INST_COMPLEX_DOUBLE #endif // check to not include this in a device test diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..a8a05850e4 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_HostLevel_Gemm.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLL_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..01525f4031 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_HostLevel_Gemm.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLR_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..c026119b97 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_HostLevel_Gemm.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLL_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..9e1eba730d --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_HostLevel_Gemm.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLR_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..86aa818b42 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_HostLevel_Gemm.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLL_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..bd8d246708 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_HostLevel_Gemm.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLR_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in similarity index 71% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in rename to batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in index c7af4806be..450d7bb5f6 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in @@ -13,14 +13,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER - -#ifndef KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_HostLevel_Gemm.hpp" +namespace KokkosBatched { namespace Impl { - -@BLAS1_ABS_MV_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLL_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in similarity index 71% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in rename to batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in index 
2780dee8ff..95a2faf3d7 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in @@ -13,14 +13,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER - -#ifndef KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_HostLevel_Gemm.hpp" +namespace KokkosBatched { namespace Impl { - -@BLAS1_ABS_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLR_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..6bdcb095f4 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLL_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..f0098ff1f0 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLR_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in similarity index 74% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in rename to batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in index da7f48f325..b7efe9f5d4 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in @@ -14,11 +14,12 @@ // //@HEADER -#ifndef KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -@BLAS1_DOT_MV_ETI_DECL_BLOCK@ +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos -#endif +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..4ef39901f9 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLR_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..f40acc60b1 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLL_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..a8e23a5169 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLR_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in similarity index 74% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in rename to batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in index 3f8cfa92d5..33e865fceb 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in @@ -14,11 +14,12 @@ // //@HEADER -#ifndef KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -@BLAS1_AXPBY_ETI_DECL_BLOCK@ +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos -#endif +#endif \ No newline at end of file diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in similarity index 74% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in rename to batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in index 44b6708c99..f81d3d6c53 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in +++ 
b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in @@ -13,12 +13,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -@BLAS1_AXPBY_MV_ETI_DECL_BLOCK@ +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos -#endif +#endif \ No newline at end of file diff --git a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index 030e452249..c11ad96959 100644 --- a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -43,7 +43,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( const VectorViewType& _X, const KrylovHandleType& handle, const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; const size_t maximum_iteration = handle.get_max_iteration(); @@ -179,7 +179,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( typename VectorViewType::array_layout, typename VectorViewType::execution_space::scratch_memory_space>; using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type**, typename VectorViewType::execution_space::scratch_memory_space>; @@ -201,7 +201,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( const int last_matrix = handle.last_index(member.league_rank()); using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::Details::ArithTraits< + 
typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type**, typename VectorViewType::execution_space::scratch_memory_space>; diff --git a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index aa528322ad..bf2f1d2e86 100644 --- a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -41,7 +41,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( const VectorViewType& _X, const KrylovHandle& handle, const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; size_t maximum_iteration = handle.get_max_iteration(); @@ -177,7 +177,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( typename VectorViewType::array_layout, typename VectorViewType::execution_space::scratch_memory_space>; using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type**, typename VectorViewType::execution_space::scratch_memory_space>; @@ -199,7 +199,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( const int last_matrix = handle.last_index(member.league_rank()); using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type**, typename VectorViewType::execution_space::scratch_memory_space>; diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp index 071b2d6634..923b67c105 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -45,9 +45,9 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& 
A, const KrylovHandleType& handle, const int GMRES_id) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; + typedef Kokkos::ArithTraits ATM; using SerialCopy1D = SerialCopy; using SerialCopy2D = SerialCopy; diff --git a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index e76d8c4239..a7219ecc91 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -48,9 +48,9 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, const TMPViewType& _TMPView) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; + typedef Kokkos::ArithTraits ATM; using TeamVectorCopy1D = TeamVectorCopy; diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index 15cb7bdca9..bb8f446f07 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -47,9 +47,9 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, const TMPViewType& _TMPView) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; + typedef Kokkos::ArithTraits ATM; using TeamCopy1D = TeamCopy; diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp 
b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index d5a19cb56b..b7527d923c 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -138,17 +138,17 @@ struct SerialSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(IntView::rank == 1, + "KokkosBatched::spmv: IntView must have rank 1."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::Rank == 1, + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -215,11 +215,11 @@ struct SerialSpmv { template KOKKOS_INLINE_FUNCTION static int invoke( - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& alpha, const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) @@ -232,13 +232,13 @@ struct SerialSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, - "KokkosBatched::spmv: IntView must have rank 2."); + static_assert(IntView::rank == 1, + "KokkosBatched::spmv: IntView must have rank 1."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. 
@@ -277,7 +277,7 @@ struct SerialSpmv { #endif return SerialSpmvInternal::template invoke< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 6f04427924..2b62be1e5a 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -19,6 +19,7 @@ /// \author Kim Liegeois (knliege@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosSparse_spmv_team.hpp" namespace KokkosBatched { @@ -321,21 +322,21 @@ struct TeamVectorSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::Rank == 1, + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::Rank == 1, + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); 
// Check compatibility of dimensions at run time. @@ -386,6 +387,12 @@ struct TeamVectorSpmv { return 1; } #endif + if (values.extent(0) == 1) { + return KokkosSparse::Experimental::team_vector_spmv( + member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), + row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), + beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + } return TeamVectorSpmvInternal::template invoke< MemberType, typename alphaViewType::non_const_value_type, @@ -403,11 +410,11 @@ struct TeamVectorSpmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& alpha, const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) @@ -420,13 +427,13 @@ struct TeamVectorSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. 
@@ -463,10 +470,16 @@ struct TeamVectorSpmv { return 1; } #endif + if (values.extent(0) == 1) { + return KokkosSparse::Experimental::team_vector_spmv( + member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, + colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, + Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + } return TeamVectorSpmvInternal::template invoke< MemberType, - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index bf2f0a82e7..c46ef7edc7 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -19,6 +19,7 @@ /// \author Kim Liegeois (knliege@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosSparse_spmv_team.hpp" namespace KokkosBatched { @@ -176,17 +177,17 @@ struct TeamSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::Rank == 1, + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check 
compatibility of dimensions at run time. @@ -237,6 +238,12 @@ struct TeamSpmv { return 1; } #endif + if (values.extent(0) == 1) { + return KokkosSparse::Experimental::team_spmv( + member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), + row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), + beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + } return TeamSpmvInternal::template invoke< MemberType, typename alphaViewType::non_const_value_type, @@ -254,11 +261,11 @@ struct TeamSpmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& alpha, const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) @@ -271,13 +278,13 @@ struct TeamSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. 
@@ -314,10 +321,16 @@ struct TeamSpmv { return 1; } #endif + if (values.extent(0) == 1) { + return KokkosSparse::Experimental::team_spmv( + member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, + colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, + Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + } return TeamSpmvInternal::template invoke< MemberType, - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, diff --git a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp index 4ceddeea44..92acc91a9e 100644 --- a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp +++ b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp @@ -29,9 +29,8 @@ namespace KokkosBatched { template class CrsMatrix { public: - using ScalarType = typename ValuesViewType::non_const_value_type; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using ScalarType = typename ValuesViewType::non_const_value_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; private: ValuesViewType values; @@ -82,10 +81,9 @@ class CrsMatrix { typename XViewType, typename YViewType> KOKKOS_INLINE_FUNCTION void apply( const MemberType &member, const XViewType &X, const YViewType &Y, - MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), - MagnitudeType beta = - Kokkos::Details::ArithTraits::zero()) const { - if (beta == Kokkos::Details::ArithTraits::zero()) { + MagnitudeType alpha = Kokkos::ArithTraits::one(), + MagnitudeType beta = Kokkos::ArithTraits::zero()) const { + if (beta == Kokkos::ArithTraits::zero()) { if (member.team_size() == 1 && n_operators == 8) KokkosBatched::TeamVectorSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 0>( @@ -109,10 +107,9 @@ class CrsMatrix { template KOKKOS_INLINE_FUNCTION void apply( const XViewType &X, 
const YViewType &Y, - MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), - MagnitudeType beta = - Kokkos::Details::ArithTraits::zero()) const { - if (beta == Kokkos::Details::ArithTraits::zero()) + MagnitudeType alpha = Kokkos::ArithTraits::one(), + MagnitudeType beta = Kokkos::ArithTraits::zero()) const { + if (beta == Kokkos::ArithTraits::zero()) KokkosBatched::SerialSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 0>( alpha, values, row_ptr, colIndices, X, beta, Y); diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index 1185ec94d4..728bb2d921 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -29,9 +29,8 @@ namespace KokkosBatched { template class JacobiPrec { public: - using ScalarType = typename ValuesViewType::non_const_value_type; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using ScalarType = typename ValuesViewType::non_const_value_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; private: ValuesViewType diag_values; @@ -55,8 +54,8 @@ class JacobiPrec { template KOKKOS_INLINE_FUNCTION void computeInverse(const MemberType &member) const { - auto one = Kokkos::Details::ArithTraits::one(); - auto epsilon = Kokkos::Details::ArithTraits::epsilon(); + auto one = Kokkos::ArithTraits::one(); + auto epsilon = Kokkos::ArithTraits::epsilon(); int tooSmall = 0; if (std::is_same::value) { for (int i = 0; i < n_operators; ++i) @@ -118,8 +117,8 @@ class JacobiPrec { } KOKKOS_INLINE_FUNCTION void computeInverse() const { - auto one = Kokkos::Details::ArithTraits::one(); - auto epsilon = Kokkos::Details::ArithTraits::epsilon(); + auto one = Kokkos::ArithTraits::one(); + auto epsilon = Kokkos::ArithTraits::epsilon(); int tooSmall = 0; for (int i = 0; i < n_operators; ++i) diff --git a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp 
b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp index 2ea489d307..9992742dd8 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp @@ -87,7 +87,7 @@ class KrylovHandle { batched_size(_batched_size), N_team(_N_team), monitor_residual(_monitor_residual) { - tolerance = Kokkos::Details::ArithTraits::epsilon(); + tolerance = Kokkos::ArithTraits::epsilon(); max_tolerance = 1e-30; if (std::is_same::value) max_tolerance = 1e-50; if (monitor_residual) { @@ -409,7 +409,7 @@ class KrylovHandle { /// \brief set_norm /// Store the norm of one of the system at one of the iteration /// - /// \param batchedteam_id [in]: Team ID + /// \param team_id [in]: Team ID /// \param batched_id [in]: Local batched ID (local ID within the team) /// \param iteration_id [in]: Iteration ID /// \param norm_i [in]: Norm to store @@ -436,9 +436,8 @@ class KrylovHandle { /// \brief set_last_norm /// Store the last norm of one system /// - /// \param batchedteam_id [in]: Team ID + /// \param team_id [in]: Team ID /// \param batched_id [in]: Local batched ID (local ID within the team) - /// \param batched_id [in]: Global batched ID /// \param norm_i [in]: Norm to store KOKKOS_INLINE_FUNCTION @@ -461,7 +460,7 @@ class KrylovHandle { /// \brief set_iteration /// Store the number of iteration after convergence for one system /// - /// \param batchedteam_id [in]: Team ID + /// \param team_id [in]: Team ID /// \param batched_id [in]: Local batched ID (local ID within the team) /// \param iteration_id [in]: Iteration ID diff --git a/batched/sparse/src/KokkosBatched_Spmv.hpp b/batched/sparse/src/KokkosBatched_Spmv.hpp index 9debd0bc27..da70acb6bb 100644 --- a/batched/sparse/src/KokkosBatched_Spmv.hpp +++ b/batched/sparse/src/KokkosBatched_Spmv.hpp @@ -75,11 +75,11 @@ struct SerialSpmv { template KOKKOS_INLINE_FUNCTION static int invoke( - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename 
ValuesViewType::non_const_value_type>::mag_type &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &X, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &beta, const yViewType &Y); }; @@ -139,11 +139,11 @@ struct TeamSpmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &beta, const yViewType &y); }; @@ -205,11 +205,11 @@ struct TeamVectorSpmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &beta, const yViewType &y); }; @@ -276,11 +276,11 @@ struct Spmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &beta, 
const yViewType &y) { int r_val = 0; diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp index c8833d27df..45b6a71f99 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp @@ -109,7 +109,7 @@ template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -125,9 +125,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp index 0aeb69fbc5..338a93d0eb 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp @@ -86,7 +86,7 @@ template void impl_test_batched_spmv(const int N, const int BlkSize) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp index d6aa0cc949..41fa682bdd 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp @@ -95,7 +95,7 @@ template void impl_test_batched_CG(const int N, const int BlkSize, const int 
N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -110,9 +110,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp index a0ef9bdd4f..2b7ab73790 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp @@ -133,7 +133,7 @@ template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -149,9 +149,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp index 2efc3e9786..5c077f75ed 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp +++ 
b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp @@ -111,7 +111,7 @@ template void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp index d326d1429d..abadf27953 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp @@ -97,7 +97,7 @@ template void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -112,9 +112,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp index d62e814e91..f4f208a829 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp @@ -133,7 +133,7 @@ template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -149,9 
+149,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp index d54f3c20e4..67d944b159 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp @@ -119,7 +119,7 @@ template void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index e8a90c38cf..04f883c21a 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -290,6 +290,13 @@ KOKKOSKERNELS_GENERATE_ETI(Blas2_gemv gemv TYPE_LISTS FLOATS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Blas2_ger ger + COMPONENTS blas + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm COMPONENTS blas HEADER_LIST ETI_HEADERS diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in similarity index 75% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in rename to blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in index a2da20787d..edfdef0a93 100644 --- 
a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in +++ b/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in @@ -14,12 +14,12 @@ // //@HEADER -#ifndef KOKKOSBLAS1_SCAL_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SCAL_ETI_SPEC_DECL_HPP_ +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosBlas2_ger_spec.hpp" namespace KokkosBlas { namespace Impl { -@BLAS1_SCAL_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif +@BLAS2_GER_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBlas diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in deleted file mode 100644 index 17b61a8857..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_IAMAX_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 35d654012e..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_IAMAX_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in deleted file mode 100644 index 406feeaf5d..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_MULT_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_MULT_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_MULT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in deleted file mode 100644 index a59f2af39d..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_MULT_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in deleted file mode 100644 index 1c9a088122..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM1_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM1_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM1_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in deleted file mode 100644 index d2a322a0ad..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM1_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in deleted file mode 100644 index dfe891afc9..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM2_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM2_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM2_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 2e0f745682..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM2_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in deleted file mode 100644 index bd7d1b11b8..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM2W_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 0a0aadc87a..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM2W_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in deleted file mode 100644 index 3f1e874724..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRMINF_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 17559306bf..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRMINF_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in deleted file mode 100644 index 7ac4b74ea4..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_RECIPROCAL_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in deleted file mode 100644 index f40958465f..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_RECIPROCAL_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in deleted file mode 100644 index e410696d54..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_ROTG_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ROTG_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_ROTG_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in deleted file mode 100644 index bd88a1e4c6..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_ROTM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ROTM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_ROTM_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 953f8e6954..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_SCAL_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 5182f61985..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_SUM_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in deleted file mode 100644 index e795c8fb9c..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -*/ -#ifndef KOKKOSBLAS1_SWAP_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SWAP_ETI_SPEC_DECL_HPP_ - -namespace KokkosBlas { -namespace Impl { -@BLAS1_SWAP_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in deleted file mode 100644 index cff04c9fbe..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_UPDATE_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in deleted file mode 100644 index deec84712b..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_UPDATE_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in deleted file mode 100644 index 9d69383b3d..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS2_GEMV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS2_GEMV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in similarity index 79% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in rename to blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in index b69e9b6b4b..a456744bd1 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in @@ -14,12 +14,12 @@ // //@HEADER -#ifndef KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL_HPP_ +#ifndef KOKKOSBLAS2_GER_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL_HPP_ namespace KokkosBlas { namespace Impl { -@BLAS1_ROTMG_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos +@BLAS2_GER_ETI_AVAIL_BLOCK@ +} // namespace Impl +} // namespace KokkosBlas #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in deleted file mode 100644 index 22ea9a1ed1..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS3_GEMM_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in deleted file mode 100644 index e802ccf4fc..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { - -@BLAS3_TRMM_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in deleted file mode 100644 index 11ca605f4f..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { - -@BLAS3_TRSM_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_decl.hpp.in deleted file mode 100644 index 1bd8c9da19..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS_TRTRI_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { - -@BLAS_TRTRI_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS_TRTRI_ETI_SPEC_DECL_HPP_ diff --git a/blas/impl/KokkosBlas1_abs_impl.hpp b/blas/impl/KokkosBlas1_abs_impl.hpp index a1b86cffb7..0334adbafe 100644 --- a/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/blas/impl/KokkosBlas1_abs_impl.hpp @@ -30,9 +30,8 @@ namespace Impl { // Entry-wise absolute value / magnitude: R(i,j) = abs(X(i,j)). 
template struct MV_Abs_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -68,9 +67,8 @@ struct MV_Abs_Functor { // Entry-wise, in-place absolute value / magnitude: R(i,j) = abs(R(i,j)). template struct MV_AbsSelf_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -98,9 +96,8 @@ struct MV_AbsSelf_Functor { // Single-vector, entry-wise absolute value / magnitude: R(i) = abs(X(i)). template struct V_Abs_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV R_; XV X_; @@ -128,9 +125,8 @@ struct V_Abs_Functor { // abs(R(i)). template struct V_AbsSelf_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV R_; @@ -149,8 +145,8 @@ struct V_AbsSelf_Functor { // Invoke the "generic" (not unrolled) multivector functor that // computes entry-wise absolute value. 
-template -void MV_Abs_Generic(const RMV& R, const XMV& X) { +template +void MV_Abs_Generic(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Abs_Generic: RMV is not a Kokkos::View."); @@ -164,9 +160,8 @@ void MV_Abs_Generic(const RMV& R, const XMV& X) { "KokkosBlas::Impl::" "MV_Abs_Generic: XMV is not rank 2"); - typedef typename XMV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if ((void*)(R.data()) == (void*)(X.data())) { // if R and X are the same (alias one another) @@ -179,8 +174,8 @@ void MV_Abs_Generic(const RMV& R, const XMV& X) { } // Variant of MV_Abs_Generic for single vectors (1-D Views) R and X. -template -void V_Abs_Generic(const RV& R, const XV& X) { +template +void V_Abs_Generic(const execution_space& space, const RV& R, const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Abs_Generic: RV is not a Kokkos::View."); @@ -194,9 +189,8 @@ void V_Abs_Generic(const RV& R, const XV& X) { "KokkosBlas::Impl::" "V_Abs_Generic: XV is not rank 1"); - typedef typename XV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if ((void*)(R.data()) == (void*)(X.data())) { // if R and X are the same (alias one another) diff --git a/blas/impl/KokkosBlas1_abs_spec.hpp b/blas/impl/KokkosBlas1_abs_spec.hpp index 525d1e9ee8..a4695bd505 100644 --- a/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_abs_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct abs_eti_spec_avail { enum : bool { value = false }; }; @@ -45,6 +45,7 @@ struct abs_eti_spec_avail { #define KOKKOSBLAS1_ABS_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ 
template <> \ struct abs_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct abs_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View::value, - bool eti_spec_avail = abs_eti_spec_avail::value> +template < + class execution_space, class RMV, class XMV, int rank = RMV::rank, + bool tpl_spec_avail = abs_tpl_spec_avail::value, + bool eti_spec_avail = abs_eti_spec_avail::value> struct Abs { - static void abs(const RMV& R, const XMV& X); + static void abs(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Abs for single vectors (1-D Views). -template -struct Abs { - typedef typename XMV::size_type size_type; +template +struct Abs { + using size_type = typename XMV::size_type; - static void abs(const RMV& R, const XMV& X) { + static void abs(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Abs<1-D>: RMV is not a Kokkos::View."); @@ -125,20 +129,21 @@ struct Abs { if (numRows < static_cast(INT_MAX)) { typedef int index_type; - V_Abs_Generic(R, X); + V_Abs_Generic(space, R, X); } else { typedef std::int64_t index_type; - V_Abs_Generic(R, X); + V_Abs_Generic(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Abs { - typedef typename XMV::size_type size_type; +template +struct Abs { + using size_type = typename XMV::size_type; - static void abs(const RMV& R, const XMV& X) { + static void abs(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Abs<2-D>: RMV is not a Kokkos::View."); @@ -169,10 +174,10 @@ struct Abs { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Abs_Generic(R, X); + MV_Abs_Generic(space, R, X); } else { typedef std::int64_t index_type; - 
MV_Abs_Generic(R, X); + MV_Abs_Generic(space, R, X); } Kokkos::Profiling::popRegion(); } @@ -191,6 +196,7 @@ struct Abs { // #define KOKKOSBLAS1_ABS_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Abs< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { // #define KOKKOSBLAS1_ABS_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Abs< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Abs< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_ABS_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Abs< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { 2, false, true>; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_ABS_HPP_ diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index e159ea798e..4e468b0e56 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -54,7 +54,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; XV m_x; YV m_y; @@ -75,10 +75,10 @@ struct Axpby_Functor { "KokkosBlas::Impl::Axpby_Functor: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YV::Rank == (int)XV::Rank, + static_assert((int)YV::rank == (int)XV::rank, "KokkosBlas::Impl::" "Axpby_Functor: X and Y must have the same rank."); - static_assert(YV::Rank == 1, + static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby_Functor: " "XV and YV must have rank 1."); @@ -188,7 +188,7 @@ struct Axpby_Functor { typedef typename YV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; XV m_x; YV m_y; @@ -211,10 +211,10 @@ struct Axpby_Functor -void Axpby_Generic(const AV& av, const XV& x, const BV& bv, const YV& y, - const SizeType startingColumn, int a = 2, int b = 2) { +template +void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, + const BV& bv, const YV& y, const SizeType startingColumn, + int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Generic: X is not a Kokkos::View."); @@ -316,16 +318,15 @@ void Axpby_Generic(const AV& av, const XV& x, const BV& bv, const YV& y, "KokkosBlas::Impl::Axpby_Generic: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YV::Rank == (int)XV::Rank, + static_assert((int)YV::rank == (int)XV::rank, "KokkosBlas::Impl::" "Axpby_Generic: X and Y must have the same rank."); - static_assert(YV::Rank == 1, + static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby_Generic: " "XV and YV must have rank 1."); - typedef typename YV::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0 && b == 0) { Axpby_Functor op(x, y, av, bv, diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 5853106823..32653b9cce 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -43,9 +43,8 @@ namespace Impl { template struct Axpby_MV_Functor { - typedef typename YMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; XMV m_x; @@ -74,16 +73,16 @@ struct Axpby_MV_Functor { "KokkosBlas::Impl::Axpby_MV_Functor: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YMV::Rank == (int)XMV::Rank, + static_assert((int)YMV::rank == (int)XMV::rank, "KokkosBlas::Impl::Axpby_MV_Functor: " "X and Y must have the same rank."); - static_assert(YMV::Rank == 2, + static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Functor: " "XMV and YMV must have rank 2."); - static_assert(AV::Rank == 1, + static_assert(AV::rank == 1, "KokkosBlas::Impl::Axpby_MV_Functor: " "AV must have rank 1."); - static_assert(BV::Rank == 1, + static_assert(BV::rank == 1, "KokkosBlas::Impl::Axpby_MV_Functor: " "BV must have rank 1."); } @@ -286,9 +285,8 @@ template struct Axpby_MV_Functor { - typedef typename YMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; XMV m_x; @@ -311,10 +309,10 @@ struct Axpby_MV_Functor struct Axpby_MV_Unroll_Functor { - typedef typename YMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; XMV m_x; YMV m_y; @@ -529,16 +526,16 @@ struct Axpby_MV_Unroll_Functor { "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YMV::Rank == (int)XMV::Rank, + static_assert((int)YMV::rank == (int)XMV::rank, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " "X and Y must have the same rank."); - static_assert(YMV::Rank == 2, + static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " "XMV and YMV must have rank 2."); - static_assert(AV::Rank == 1, + static_assert(AV::rank == 1, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " "AV must have rank 1."); - static_assert(BV::Rank == 1, + static_assert(BV::rank == 1, "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " "BV must have rank 1."); @@ -728,9 +725,8 @@ template { - typedef typename YMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; XMV m_x; YMV m_y; @@ -753,10 +749,10 @@ struct Axpby_MV_Unroll_Functor -void Axpby_MV_Unrolled(const AV& av, const XMV& x, const BV& bv, const YMV& y, +template +void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, + const BV& bv, const YMV& y, const SizeType startingColumn, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -965,16 +963,15 @@ void Axpby_MV_Unrolled(const AV& av, const XMV& x, const BV& bv, const YMV& y, "KokkosBlas::Impl::Axpby_MV_Unrolled: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YMV::Rank == (int)XMV::Rank, + static_assert((int)YMV::rank == (int)XMV::rank, "KokkosBlas::Impl::" "Axpby_MV_Unrolled: X and Y must have the same rank."); - static_assert(YMV::Rank == 2, + static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Unrolled: " "XMV and YMV must have rank 2."); - typedef typename YMV::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0 && b == 0) { Axpby_MV_Unroll_Functor op( @@ -1106,9 +1103,10 @@ void Axpby_MV_Unrolled(const AV& av, const XMV& x, const BV& bv, const YMV& y, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template -void Axpby_MV_Generic(const AV& av, const XMV& x, const BV& bv, const YMV& y, - int a = 2, int b = 2) { +template +void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, + const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Generic: X is not a Kokkos::View."); @@ -1120,16 +1118,15 @@ void Axpby_MV_Generic(const AV& av, const XMV& x, const BV& bv, const YMV& y, "KokkosBlas::Impl::Axpby_MV_Generic: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YMV::Rank == (int)XMV::Rank, + static_assert((int)YMV::rank == (int)XMV::rank, "KokkosBlas::Impl::" "Axpby_MV_Generic: X and Y must have the same rank."); - static_assert(YMV::Rank == 2, + static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Generic: " "XMV and YMV must have rank 2."); - typedef typename YMV::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0 && b == 0) { Axpby_MV_Functor op(x, y, av, bv); @@ -1245,10 +1242,11 @@ void Axpby_MV_Generic(const AV& av, const XMV& x, const BV& bv, const YMV& y, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Left { - static void run(const AV& av, const XMV& x, const BV& bv, const YMV& y, - int a = 2, int b = 2) { + static void run(const execution_space& space, const AV& av, const XMV& x, + const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Invoke_Left: X is not a Kokkos::View."); @@ -1260,10 +1258,10 @@ struct Axpby_MV_Invoke_Left { "KokkosBlas::Impl::Axpby_MV_Invoke_Left: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YMV::Rank == (int)XMV::Rank, + static_assert((int)YMV::rank == (int)XMV::rank, "KokkosBlas::Impl::" "Axpby_MV_Invoke_Left: X and Y must have the same rank."); - static_assert(YMV::Rank == 2, + static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Invoke_Left: " "X and Y must have rank 2."); @@ -1280,8 +1278,8 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. 
If they are scalars, // the functor doesn't have to do anything to them. - Axpby_MV_Unrolled(av, X_cur, bv, Y_cur, j, - a, b); + Axpby_MV_Unrolled( + space, av, X_cur, bv, Y_cur, j, a, b); } for (; j + 4 <= numCols; j += 4) { XMV X_cur = Kokkos::subview(x, Kokkos::ALL(), std::make_pair(j, j + 4)); @@ -1290,8 +1288,8 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. - Axpby_MV_Unrolled(av, X_cur, bv, Y_cur, j, - a, b); + Axpby_MV_Unrolled( + space, av, X_cur, bv, Y_cur, j, a, b); } for (; j < numCols; ++j) { auto x_cur = Kokkos::subview(x, Kokkos::ALL(), j); @@ -1302,7 +1300,8 @@ struct Axpby_MV_Invoke_Left { // the functor doesn't have to do anything to them. typedef decltype(x_cur) XV; typedef decltype(y_cur) YV; - Axpby_Generic(av, x_cur, bv, y_cur, j, a, b); + Axpby_Generic( + space, av, x_cur, bv, y_cur, j, a, b); } } }; @@ -1326,10 +1325,11 @@ struct Axpby_MV_Invoke_Left { // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Right { - static void run(const AV& av, const XMV& x, const BV& bv, const YMV& y, - int a = 2, int b = 2) { + static void run(const execution_space& space, const AV& av, const XMV& x, + const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Invoke_Right: X is not a Kokkos::View."); @@ -1341,10 +1341,10 @@ struct Axpby_MV_Invoke_Right { "KokkosBlas::Impl::Axpby_MV_Invoke_Right: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YMV::Rank == (int)XMV::Rank, + static_assert((int)YMV::rank == (int)XMV::rank, "KokkosBlas::Impl::" "Axpby_MV_Invoke_Right: X and Y must have the same rank."); - static_assert(YMV::Rank == 2, + static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby_MV_Invoke_Right: " "X and Y must have rank 2."); @@ -1354,9 +1354,11 @@ struct Axpby_MV_Invoke_Right { auto y_0 = Kokkos::subview(y, Kokkos::ALL(), 0); typedef decltype(x_0) XV; typedef decltype(y_0) YV; - Axpby_Generic(av, x_0, bv, y_0, 0, a, b); + Axpby_Generic( + space, av, x_0, bv, y_0, 0, a, b); } else { - Axpby_MV_Generic(av, x, bv, y, a, b); + Axpby_MV_Generic( + space, av, x, bv, y, a, b); } } }; diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index e9c1fb645a..da2924c9f3 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -28,7 +28,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_eti_spec_avail { enum : bool { value = false }; }; @@ -46,7 +47,7 @@ struct axpby_eti_spec_avail { MEM_SPACE) \ template <> \ struct axpby_eti_spec_avail< \ - SCALAR, \ + EXEC_SPACE, SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -68,7 +69,7 @@ struct axpby_eti_spec_avail { MEM_SPACE) \ template <> \ struct axpby_eti_spec_avail< \ - SCALAR, \ + EXEC_SPACE, SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -80,6 +81,7 @@ struct axpby_eti_spec_avail { }; \ template <> \ struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -127,28 +129,35 @@ namespace Impl { /// Any scalar coefficient of zero has BLAS semantics of /// ignoring the corresponding (multi)vector entry. This does NOT /// apply to coefficients in av and bv vectors, if they are used. 
-template ::value, - bool eti_spec_avail = axpby_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + axpby_eti_spec_avail::value> struct Axpby { - static void axpby(const AV& av, const XMV& X, const BV& bv, const YMV& Y); + static void axpby(const execution_space& space, const AV& av, const XMV& X, + const BV& bv, const YMV& Y); }; -template -struct Axpby { - static void axpby(const AV& /* av */, const XMV& /* X */, const BV& /* bv */, +template +struct Axpby { + static void axpby(const execution_space& /*space*/, const AV& /* av */, + const XMV& /* X */, const BV& /* bv */, const YMV& /* Y */) { - static_assert(YMV::Rank == 0, "Oh My God"); + static_assert(YMV::rank == 0, "Oh My God"); } }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Full specialization for XMV and YMV rank-2 Views. -template -struct Axpby { +template +struct Axpby { typedef typename YMV::size_type size_type; - static void axpby(const AV& av, const XMV& X, const BV& bv, const YMV& Y) { + static void axpby(const execution_space& space, const AV& av, const XMV& X, + const BV& bv, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); @@ -160,10 +169,10 @@ struct Axpby { "KokkosBlas::Impl::Axpby::axpby: Y is const. 
" "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YMV::Rank == (int)XMV::Rank, + static_assert((int)YMV::rank == (int)XMV::rank, "KokkosBlas::Impl::Axpby::axpby (MV): " "X and Y must have the same rank."); - static_assert(YMV::Rank == 2, + static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby::axpby: " "X and Y must have rank 2."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -199,18 +208,18 @@ struct Axpby { typedef int index_type; typedef typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type - Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(av, X, bv, Y, a, b); + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); } else { typedef typename XMV::size_type index_type; typedef typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type - Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(av, X, bv, Y, a, b); + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); } Kokkos::Profiling::popRegion(); } @@ -218,18 +227,18 @@ struct Axpby { // Partial specialization for XMV, and YMV rank-2 Views, // and AV and BV scalars. 
-template -struct Axpby +struct Axpby { typedef typename XMV::non_const_value_type AV; typedef typename YMV::non_const_value_type BV; typedef typename YMV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATB; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATB; - static void axpby(const AV& alpha, const XMV& X, const BV& beta, - const YMV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XMV& X, + const BV& beta, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby::axpby (MV): " "X is not a Kokkos::View."); @@ -241,10 +250,10 @@ struct Axpby::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type - Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(alpha, X, beta, Y, a, b); + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); } else { typedef typename XMV::size_type index_type; typedef typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type - Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(alpha, X, beta, Y, a, b); + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); } Kokkos::Profiling::popRegion(); } @@ -320,17 +329,18 @@ struct Axpby -struct Axpby +struct Axpby { typedef typename XV::non_const_value_type AV; typedef typename YV::non_const_value_type BV; typedef typename YV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATB; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATB; - static void axpby(const AV& alpha, const XV& X, const BV& beta, const YV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XV& X, + const BV& beta, const YV& Y) { static_assert(Kokkos::is_view::value, 
"KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); @@ -342,10 +352,10 @@ struct Axpby::axpby: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YV::Rank == (int)XV::Rank, + static_assert((int)YV::rank == (int)XV::rank, "KokkosBlas::Impl::" "Axpby::axpby: X and Y must have the same rank."); - static_assert(YV::Rank == 1, + static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby::axpby: " "X and Y must have rank 1."); @@ -394,14 +404,14 @@ struct Axpby(INT_MAX)) { typedef int index_type; - Axpby_Generic( - alpha, X, beta, Y, 0, a, b); + space, alpha, X, beta, Y, 0, a, b); } else { typedef typename XV::size_type index_type; - Axpby_Generic( - alpha, X, beta, Y, 0, a, b); + space, alpha, X, beta, Y, 0, a, b); } Kokkos::Profiling::popRegion(); } @@ -422,7 +432,7 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ @@ -433,7 +443,7 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ @@ -453,7 +463,7 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ @@ -462,6 +472,7 @@ struct Axpby >, \ 2, false, true>; \ extern template struct Axpby< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -478,7 +489,7 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ @@ -487,6 +498,7 @@ struct Axpby >, \ 2, false, true>; \ template struct Axpby< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -501,7 +513,5 @@ struct Axpby; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_AXPBY_HPP_ diff --git a/blas/impl/KokkosBlas1_dot_impl.hpp b/blas/impl/KokkosBlas1_dot_impl.hpp index cfcd0020ef..2003f7cc2c 100644 --- a/blas/impl/KokkosBlas1_dot_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_impl.hpp @@ -30,9 +30,9 @@ namespace Impl { /// \tparam YVector Type of the second vector y; 1-D View /// \tparam SizeType Type of the row index used in the dot product. /// For best performance, use int instead of size_t here. 
-template +template struct DotFunctor { - typedef typename XVector::execution_space execution_space; typedef SizeType size_type; typedef typename AV::non_const_value_type avalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; @@ -43,8 +43,9 @@ struct DotFunctor { DotFunctor(const XVector& x, const YVector& y) : m_x(x), m_y(y) {} - void run(const char* label, AV result) { - Kokkos::RangePolicy policy(0, m_x.extent(0)); + void run(const char* label, const execution_space& space, AV result) { + Kokkos::RangePolicy policy(space, 0, + m_x.extent(0)); Kokkos::parallel_reduce(label, policy, *this, result); } @@ -56,7 +57,7 @@ struct DotFunctor { } KOKKOS_INLINE_FUNCTION void init(value_type& update) const { - update = Kokkos::Details::ArithTraits::zero(); + update = Kokkos::ArithTraits::zero(); } KOKKOS_INLINE_FUNCTION void join(value_type& update, diff --git a/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/blas/impl/KokkosBlas1_dot_mv_impl.hpp index 9dda766b03..d19e512599 100644 --- a/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -72,14 +72,13 @@ struct Dot_MV_Functor { // Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Dot_Invoke( - const RV& r, const XV& x, const YV& y, + const execution_space& space, const RV& r, const XV& x, const YV& y, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; - size_type numDots = std::max(x.extent(1), y.extent(1)); + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { + size_type numDots = std::max(x.extent(1), y.extent(1)); if (x.extent(0) != y.extent(0)) { std::ostringstream oss; oss << "KokkosBlas::dot (rank-2): x and y have different lengths (" @@ -103,14 +102,13 @@ void MV_Dot_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, - Kokkos::ArithTraits::zero()); + space, r, 
Kokkos::ArithTraits::zero()); size_type teamsPerDot; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), numDots, teamsPerDot); size_type numTeams = numDots * teamsPerDot; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("Dot_MV", pol, Dot_MV_Functor( r, x, y, teamsPerDot)); @@ -118,18 +116,20 @@ void MV_Dot_Invoke( // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Dot_Invoke( - const RV& r, const XV& x, const YV& y, + const execution_space& space, const RV& r, const XV& x, const YV& y, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), r.extent(0)); - MV_Dot_Invoke(tempResult, x, y); - Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + MV_Dot_Invoke( + space, tempResult, x, y); + Kokkos::deep_copy(space, r, tempResult); + space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index 430f357a36..02efee6bc5 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -61,8 +61,8 @@ struct HasSpecialAccumulator { }; // Specialization struct which defines whether a specialization exists -template +template struct dot_eti_spec_avail { enum : bool { value = false }; }; @@ -78,6 +78,7 @@ struct dot_eti_spec_avail { #define KOKKOSBLAS1_DOT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View>, \ Kokkos::View \ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View \ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -129,6 +132,7 @@ struct dot_eti_spec_avail { }; \ template <> \ 
struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -144,6 +148,7 @@ struct dot_eti_spec_avail { }; \ template <> \ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -167,17 +172,21 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = dot_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + dot_eti_spec_avail::value> struct Dot { - static void dot(const RV&, const XV& R, const YV& X); + static void dot(const execution_space& space, const RV&, const XV& R, + const YV& X); }; // This version never has TPL support, but it does use the same ETI system -template ::value> +template ::value> struct DotSpecialAccumulator { // Note: not doing the static_asserts to validate RV, XV, YV since those // errors would have already arisen when building the library. @@ -191,15 +200,17 @@ struct DotSpecialAccumulator { typename RV::device_type, Kokkos::MemoryTraits>; - static void dot(const RV_Result& R, const XV& X, const YV& Y); + static void dot(const execution_space& space, const RV_Result& R, const XV& X, + const YV& Y); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Dot for single vectors (1-D Views). // The rank-1 case is currently the only one that may use a different // accumulator type than InnerProductSpaceTraits::dot_type. -template -struct Dot { +template +struct Dot { // Check some things about the template parameters at compile time to get nice // error messages, before using them under the assumption they are valid. static_assert(Kokkos::is_view::value, @@ -237,7 +248,8 @@ struct Dot { Kokkos::MemoryTraits> RV_Result; - static void dot(const RV& R, const XV& X, const YV& Y) { + static void dot(const execution_space& space, const RV& R, const XV& X, + const YV& Y) { Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::dot[ETI]" : "KokkosBlas::dot[noETI]"); @@ -254,12 +266,12 @@ struct Dot { if (numElems < static_cast(INT_MAX)) { typedef int index_type; - DotFunctor f(X, Y); - f.run("KokkosBlas::dot<1D>", R); + DotFunctor f(X, Y); + f.run("KokkosBlas::dot<1D>", space, R); } else { typedef int64_t index_type; - DotFunctor f(X, Y); - f.run("KokkosBlas::dot<1D>", R); + DotFunctor f(X, Y); + f.run("KokkosBlas::dot<1D>", space, R); } Kokkos::Profiling::popRegion(); } @@ -269,15 +281,16 @@ struct Dot { // uses DotAccumulatingScalar for the result view. // // Is never supported by TPLs, but uses the same dot_eti_spec_avail::value. -template -struct DotSpecialAccumulator { +template +struct DotSpecialAccumulator { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: XV is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: YV is not a Kokkos::View."); - static_assert(XV::rank == YV::rank, + static_assert(static_cast(XV::rank) == static_cast(YV::rank), "KokkosBlas::Impl::" "DotSpecialAccumulator: X and Y have different ranks."); static_assert(XV::rank == 1, @@ -306,7 +319,8 @@ struct DotSpecialAccumulator { typename RV::device_type, Kokkos::MemoryTraits>; - static void dot(const RV_Result& R, const XV& X, const YV& Y) { + static void dot(const execution_space& space, const RV_Result& R, const XV& X, + const YV& Y) { Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::dot[ETI]" : "KokkosBlas::dot[noETI]"); @@ -323,19 +337,20 @@ struct DotSpecialAccumulator { if (numElems < static_cast(INT_MAX)) { typedef int index_type; - DotFunctor f(X, Y); - f.run("KokkosBlas::dot<1D>", R); + DotFunctor f(X, Y); + f.run("KokkosBlas::dot<1D>", space, R); } else { typedef int64_t index_type; - DotFunctor f(X, Y); - f.run("KokkosBlas::dot<1D>", R); + DotFunctor f(X, Y); + f.run("KokkosBlas::dot<1D>", space, R); } Kokkos::Profiling::popRegion(); } }; -template -struct Dot +struct Dot { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -363,7 +378,8 @@ struct Dot(INT_MAX)) { typedef int index_type; - DotFunctor f(X0, - Y0); - f.run("KokkosBlas::dot<1D>", R0); + DotFunctor + f(X0, Y0); + f.run("KokkosBlas::dot<1D>", space, R0); } else { typedef int64_t index_type; - DotFunctor f(X0, - Y0); - f.run("KokkosBlas::dot<1D>", R0); + DotFunctor + f(X0, Y0); + f.run("KokkosBlas::dot<1D>", space, R0); } } else { if (numRows < static_cast(INT_MAX) && numRows * numDots < static_cast(INT_MAX)) { typedef int index_type; - MV_Dot_Invoke(R, X, Y); + MV_Dot_Invoke(space, R, X, Y); } else { typedef std::int64_t index_type; - MV_Dot_Invoke(R, X, Y); + MV_Dot_Invoke(space, R, X, Y); } } Kokkos::Profiling::popRegion(); @@ -421,6 +439,7 @@ struct Dot>, \ Kokkos::View>, \ 1, 1, false, true>; \ extern template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ 1, 1, false, true>; \ extern template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ true>; \ extern template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ Kokkos::View>, \ Kokkos::View; #define KOKKOSBLAS1_DOT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Dot>, \ Kokkos::View, \ @@ -472,6 +495,7 @@ struct Dot>, \ 1, 1, false, true>; \ template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ 1, 1, false, true>; \ template struct 
DotSpecialAccumulator< \ + EXEC_SPACE, \ Kokkos::View>, \ Kokkos::View>, \ true>; \ template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ @@ -523,6 +550,7 @@ struct Dot>, \ 2, 2, false, true>; \ extern template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -535,6 +563,7 @@ struct Dot>, \ 2, 1, false, true>; \ extern template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -550,6 +579,7 @@ struct Dot, \ @@ -562,6 +592,7 @@ struct Dot>, \ 2, 2, false, true>; \ template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -574,6 +605,7 @@ struct Dot>, \ 2, 1, false, true>; \ template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -587,7 +619,5 @@ struct Dot; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/blas/impl/KokkosBlas1_iamax_impl.hpp b/blas/impl/KokkosBlas1_iamax_impl.hpp index 855a503422..4c7a3fcc0c 100644 --- a/blas/impl/KokkosBlas1_iamax_impl.hpp +++ b/blas/impl/KokkosBlas1_iamax_impl.hpp @@ -79,21 +79,20 @@ struct V_Iamax_Functor { /// \brief Find the index of the element with the maximum magnitude of the /// single vector (1-D /// View) X, and store the result in the 0-D View r. 
-template -void V_Iamax_Invoke(const RV& r, const XV& X) { - using execution_space = typename XV::execution_space; - using AT = Kokkos::Details::ArithTraits; +template +void V_Iamax_Invoke(const execution_space& space, const RV& r, const XV& X) { + using AT = Kokkos::ArithTraits; using mag_type = typename AT::mag_type; const SizeType numRows = static_cast(X.extent(0)); // Avoid MaxLoc Reduction if this is a zero length view if (numRows == 0) { - Kokkos::deep_copy(r, 0); + Kokkos::deep_copy(space, r, 0); return; } - Kokkos::RangePolicy policy(1, numRows + 1); + Kokkos::RangePolicy policy(space, 1, numRows + 1); using functor_type = V_Iamax_Functor; functor_type op(X); @@ -103,12 +102,13 @@ void V_Iamax_Invoke(const RV& r, const XV& X) { /// \brief Find the index of the element with the maximum magnitude of the /// columns of the /// multivector (2-D View) X, and store result(s) in the 1-D View r. -template -void MV_Iamax_Invoke(const RV& r, const XMV& X) { +template +void MV_Iamax_Invoke(const execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_Iamax_Invoke(ri, Xi); + V_Iamax_Invoke( + space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_iamax_spec.hpp b/blas/impl/KokkosBlas1_iamax_spec.hpp index 57d0056e92..341b949050 100644 --- a/blas/impl/KokkosBlas1_iamax_spec.hpp +++ b/blas/impl/KokkosBlas1_iamax_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct iamax_eti_spec_avail { enum : bool { value = false }; }; @@ -47,6 +47,7 @@ struct iamax_eti_spec_avail { EXEC_SPACE, MEM_SPACE) \ template <> \ struct iamax_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View \ struct iamax_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct iamax_eti_spec_avail< \ + EXEC_SPACE, \ 
Kokkos::View >, \ Kokkos::View \ struct iamax_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View::value, - bool eti_spec_avail = iamax_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + iamax_eti_spec_avail::value> struct Iamax { - static void iamax(const RMV& R, const XMV& X); + static void iamax(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Iamax for single vectors (1-D Views). -template -struct Iamax { +template +struct Iamax { typedef typename XMV::size_type size_type; - static void iamax(const RMV& R, const XMV& X) { + static void iamax(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Iamax<1-D>: RMV is not a Kokkos::View."); @@ -164,20 +171,21 @@ struct Iamax { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Iamax_Invoke(R, X); + V_Iamax_Invoke(space, R, X); } else { typedef std::int64_t index_type; - V_Iamax_Invoke(R, X); + V_Iamax_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Iamax { +template +struct Iamax { typedef typename XMV::size_type size_type; - static void iamax(const RV& R, const XMV& X) { + static void iamax(const execution_space& space, const RV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Iamax<2-D>: RV is not a Kokkos::View."); @@ -207,10 +215,10 @@ struct Iamax { const size_type numCols = X.extent(1); if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Iamax_Invoke(R, X); + MV_Iamax_Invoke(space, R, X); } else { typedef std::int64_t index_type; - MV_Iamax_Invoke(R, X); + MV_Iamax_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } @@ -230,6 +238,7 @@ struct Iamax { #define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) \ 
extern template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { Kokkos::MemoryTraits >, \ 1, false, true>; \ extern template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) \ template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { Kokkos::MemoryTraits >, \ 1, false, true>; \ template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) \ extern template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { Kokkos::MemoryTraits >, \ 2, false, true>; \ extern template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) \ template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { Kokkos::MemoryTraits >, \ 2, false, true>; \ template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { MEM_SPACE) #include -#include -#include #endif // KOKKOSBLAS1_IAMAX_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_mult_impl.hpp b/blas/impl/KokkosBlas1_mult_impl.hpp index 988b5cf534..048db395b0 100644 --- a/blas/impl/KokkosBlas1_mult_impl.hpp +++ b/blas/impl/KokkosBlas1_mult_impl.hpp @@ -37,9 +37,8 @@ namespace Impl { template struct MV_MultFunctor { - typedef typename CMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type m_n; typename CMV::const_value_type m_c; @@ -105,9 +104,8 @@ struct MV_MultFunctor { template struct V_MultFunctor { - typedef typename CV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef 
Kokkos::ArithTraits ATS; typename CV::const_value_type m_c; CV m_C; @@ -146,18 +144,18 @@ struct V_MultFunctor { /// /// C(i) = c * C(i) + ab * A(i) * B(i), subject to the usual BLAS /// update rules. -template -void V_Mult_Generic(typename CV::const_value_type& c, const CV& C, +template +void V_Mult_Generic(const execution_space& space, + typename CV::const_value_type& c, const CV& C, typename AV::const_value_type& ab, const AV& A, const BV& B) { using Kokkos::ALL; using Kokkos::subview; - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATC; - typedef typename CV::execution_space execution_space; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATC; const SizeType numRows = C.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (c == ATC::zero()) { if (ab == ATA::zero()) { @@ -193,13 +191,13 @@ void V_Mult_Generic(typename CV::const_value_type& c, const CV& C, /// /// C(i,j) = c * C(i,j) + ab * A(i) * B(i,j), subject to the usual /// BLAS update rules. 
-template -void MV_Mult_Generic(typename CMV::const_value_type& c, const CMV& C, +template +void MV_Mult_Generic(const execution_space& space, + typename CMV::const_value_type& c, const CMV& C, typename AV::const_value_type& ab, const AV& A, const BMV& B) { - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATC; - typedef typename CMV::execution_space execution_space; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATC; if (C.extent(1) == 1) { auto C_0 = Kokkos::subview(C, Kokkos::ALL(), 0); @@ -207,12 +205,13 @@ void MV_Mult_Generic(typename CMV::const_value_type& c, const CMV& C, typedef decltype(C_0) CV; typedef decltype(B_0) BV; - V_Mult_Generic(c, C_0, ab, A, B_0); + V_Mult_Generic(space, c, C_0, ab, A, + B_0); return; } const SizeType numRows = C.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (c == ATC::zero()) { if (ab == ATA::zero()) { diff --git a/blas/impl/KokkosBlas1_mult_spec.hpp b/blas/impl/KokkosBlas1_mult_spec.hpp index 1c0a88e8dc..c81e00a6b0 100644 --- a/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_mult_spec.hpp @@ -27,7 +27,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_eti_spec_avail { enum : bool { value = false }; }; @@ -44,6 +45,7 @@ struct mult_eti_spec_avail { #define KOKKOSBLAS1_MULT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct mult_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct mult_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View::value, - bool eti_spec_avail = mult_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + mult_eti_spec_avail::value> struct Mult { - static void mult(const typename YMV::non_const_value_type& gamma, + static void mult(const execution_space& space, + const 
typename YMV::non_const_value_type& gamma, const YMV& Y, const typename XMV::non_const_value_type& alpha, const AV& A, const XMV& X); @@ -110,14 +117,16 @@ struct Mult { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for YMV, AV, and XMV rank-2 Views. -template -struct Mult { +template +struct Mult { typedef typename YMV::size_type size_type; typedef typename YMV::non_const_value_type YMV_scalar; typedef typename XMV::non_const_value_type XMV_scalar; - static void mult(const YMV_scalar& gamma, const YMV& Y, - const XMV_scalar& alpha, const AV& A, const XMV& X) { + static void mult(const execution_space& space, const YMV_scalar& gamma, + const YMV& Y, const XMV_scalar& alpha, const AV& A, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Mult::mult: Y is not a Kokkos::View."); @@ -160,23 +169,27 @@ struct Mult { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Mult_Generic(gamma, Y, alpha, A, X); + MV_Mult_Generic(space, gamma, Y, + alpha, A, X); } else { - MV_Mult_Generic(gamma, Y, alpha, A, X); + MV_Mult_Generic(space, gamma, Y, + alpha, A, X); } Kokkos::Profiling::popRegion(); } }; // Partial specialization for YV, AV, and XV rank-1 Views. -template -struct Mult { +template +struct Mult { typedef typename YV::size_type size_type; typedef typename YV::non_const_value_type YV_scalar; typedef typename XV::non_const_value_type XV_scalar; - static void mult(const YV_scalar& gamma, const YV& Y, const XV_scalar& alpha, - const AV& A, const XV& X) { + static void mult(const execution_space& space, const YV_scalar& gamma, + const YV& Y, const XV_scalar& alpha, const AV& A, + const XV& X) { // YV, AV, and XV must be Kokkos::View specializations. 
static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -212,9 +225,11 @@ struct Mult { const size_type numRows = Y.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Mult_Generic(gamma, Y, alpha, A, X); + V_Mult_Generic(space, gamma, Y, alpha, + A, X); } else { - V_Mult_Generic(gamma, Y, alpha, A, X); + V_Mult_Generic(space, gamma, Y, + alpha, A, X); } Kokkos::Profiling::popRegion(); } @@ -235,6 +250,7 @@ struct Mult { #define KOKKOSBLAS1_MULT_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Mult< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_MULT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Mult< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Mult< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_MULT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Mult< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { 2, false, true>; #include -#include -#include #endif // KOKKOSBLAS1_MULT_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrm1_impl.hpp b/blas/impl/KokkosBlas1_nrm1_impl.hpp index 433ce580df..a88c01023e 100644 --- a/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -35,12 +35,11 @@ namespace Impl { /// \tparam SizeType Index type. Use int (32 bits) if possible. 
template struct V_Nrm1_Functor { - typedef typename XV::execution_space execution_space; - typedef SizeType size_type; - typedef typename XV::non_const_value_type xvalue_type; - typedef Kokkos::ArithTraits XAT; - typedef typename XAT::mag_type value_type; - typedef Kokkos::ArithTraits MAT; + using size_type = SizeType; + using xvalue_type = typename XV::non_const_value_type; + using XAT = Kokkos::ArithTraits; + using value_type = typename XAT::mag_type; + using MAT = Kokkos::ArithTraits; typename XV::const_type m_x; @@ -70,11 +69,11 @@ struct V_Nrm1_Functor { template struct Nrm1_MV_Functor { - typedef typename RV::non_const_value_type rvalue_type; - typedef typename XV::non_const_value_type xvalue_type; - typedef Kokkos::ArithTraits XAT; - typedef typename XAT::mag_type value_type; - typedef Kokkos::ArithTraits MAT; + using rvalue_type = typename RV::non_const_value_type; + using xvalue_type = typename XV::non_const_value_type; + using XAT = Kokkos::ArithTraits; + using value_type = typename XAT::mag_type; + using MAT = Kokkos::ArithTraits; using TeamMem = typename Kokkos::TeamPolicy::member_type; @@ -112,11 +111,10 @@ struct Nrm1_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. -template -void V_Nrm1_Invoke(const RV& r, const XV& X) { - typedef typename XV::execution_space execution_space; +template +void V_Nrm1_Invoke(const execution_space& space, const RV& r, const XV& X) { const SizeType numRows = static_cast(X.extent(0)); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_Nrm1_Functor functor_type; functor_type op(X); @@ -127,13 +125,12 @@ void V_Nrm1_Invoke(const RV& r, const XV& X) { /// multivector (2-D View) X, and store result(s) in the 1-D View r. 
// Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Nrm1_Invoke( - const RV& r, const XV& x, + const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm1 (rank-2): result vector has wrong length (" @@ -142,14 +139,13 @@ void MV_Nrm1_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, - Kokkos::ArithTraits::zero()); + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), x.extent(1), teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for( "KokkosBlas1::Nrm1::S1", pol, Nrm1_MV_Functor(r, x, teamsPerVec)); @@ -157,18 +153,23 @@ void MV_Nrm1_Invoke( // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Nrm1_Invoke( - const RV& r, const XV& x, + const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); - MV_Nrm1_Invoke(tempResult, x); - Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + MV_Nrm1_Invoke( + space, tempResult, x); + Kokkos::deep_copy(space, r, tempResult); + // Fence needed to ensure that the deep_copy + // above finishes before we exit this function + // and tempResult runs out of scope... 
+ space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm1_spec.hpp b/blas/impl/KokkosBlas1_nrm1_spec.hpp index dbaface96e..24f093c736 100644 --- a/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm1_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct nrm1_eti_spec_avail { #define KOKKOSBLAS1_NRM1_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct nrm1_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -68,6 +69,7 @@ struct nrm1_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrm1_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -90,20 +92,22 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = nrm1_eti_spec_avail::value> +template < + class execution_space, class RMV, class XMV, int rank = XMV::rank, + bool tpl_spec_avail = nrm1_tpl_spec_avail::value, + bool eti_spec_avail = nrm1_eti_spec_avail::value> struct Nrm1 { - static void nrm1(const RMV& R, const XMV& X); + static void nrm1(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm1 for single vectors (1-D Views). 
-template -struct Nrm1 { - typedef typename XMV::size_type size_type; +template +struct Nrm1 { + using size_type = typename XMV::size_type; - static void nrm1(const RMV& R, const XMV& X) { + static void nrm1(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm1<1-D>: RMV is not a Kokkos::View."); @@ -131,20 +135,21 @@ struct Nrm1 { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Nrm1_Invoke(R, X); + V_Nrm1_Invoke(space, R, X); } else { - typedef std::int64_t index_type; - V_Nrm1_Invoke(R, X); + using index_type = std::int64_t; + V_Nrm1_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Nrm1 { - typedef typename XMV::size_type size_type; +template +struct Nrm1 { + using size_type = typename XMV::size_type; - static void nrm1(const RV& R, const XMV& X) { + static void nrm1(const execution_space& space, const RV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm1<2-D>: RV is not a Kokkos::View."); @@ -176,18 +181,20 @@ struct Nrm1 { auto R0 = Kokkos::subview(R, 0); auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Nrm1_Invoke(R0, X0); + V_Nrm1_Invoke(space, + R0, X0); } else { typedef std::int64_t index_type; - V_Nrm1_Invoke(R0, X0); + V_Nrm1_Invoke( + space, R0, X0); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm1_Invoke(R, X); + MV_Nrm1_Invoke(space, R, X); } else { - typedef std::int64_t index_type; - MV_Nrm1_Invoke(R, X); + using index_type = std::int64_t; + MV_Nrm1_Invoke(space, R, X); } } Kokkos::Profiling::popRegion(); @@ -207,6 +214,7 @@ struct Nrm1 { // #define KOKKOSBLAS1_NRM1_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Nrm1< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -223,6 +231,7 @@ 
struct Nrm1 { // #define KOKKOSBLAS1_NRM1_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Nrm1< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -242,6 +251,7 @@ struct Nrm1 { #define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Nrm1< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -261,6 +271,7 @@ struct Nrm1 { #define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Nrm1< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -273,7 +284,5 @@ struct Nrm1 { 2, false, true>; #include -#include -#include #endif // KOKKOSBLAS1_NRM1_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrm2_impl.hpp b/blas/impl/KokkosBlas1_nrm2_impl.hpp index 32f4660f18..276023c171 100644 --- a/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -35,11 +35,10 @@ namespace Impl { /// \tparam SizeType Index type. Use int (32 bits) if possible. 
template struct V_Nrm2_Functor { - typedef typename XV::execution_space execution_space; typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; typename XV::const_type m_x; @@ -80,8 +79,7 @@ struct V_Nrm2_Functor { KOKKOS_INLINE_FUNCTION void final(value_type& update) const { if (m_take_sqrt) update = - Kokkos::Details::ArithTraits::sqrt( - update); + Kokkos::ArithTraits::sqrt(update); } }; @@ -96,7 +94,7 @@ struct Nrm2_MV_Functor { typedef typename RV::non_const_value_type rvalue_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; using TeamMem = typename Kokkos::TeamPolicy::member_type; @@ -137,13 +135,13 @@ struct Nrm2_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. -template -void V_Nrm2_Invoke(const RV& r, const XV& X, const bool& take_sqrt) { - typedef typename XV::execution_space execution_space; +template +void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, + const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); - typedef V_Nrm2_Functor functor_type; + using functor_type = V_Nrm2_Functor; functor_type op(X, take_sqrt); Kokkos::parallel_reduce("KokkosBlas::Nrm2::S0", policy, op, r); } @@ -152,13 +150,12 @@ void V_Nrm2_Invoke(const RV& r, const XV& X, const bool& take_sqrt) { /// multivector (2-D View) X, and store result(s) in the 1-D View r. 
// Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Nrm2_Invoke( - const RV& r, const XV& x, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm2 (rank-2): result vector has wrong length (" @@ -167,38 +164,40 @@ void MV_Nrm2_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, - Kokkos::ArithTraits::zero()); + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), x.extent(1), teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for( "KokkosBlas1::Nrm2::S1", pol, Nrm2_MV_Functor(r, x, teamsPerVec)); if (take_sqrt) { - Kokkos::parallel_for("KokkosBlas1::Nrm2::Sqrt", - Kokkos::RangePolicy(0, r.extent(0)), - TakeSqrtFunctor(r)); + Kokkos::parallel_for( + "KokkosBlas1::Nrm2::Sqrt", + Kokkos::RangePolicy(space, 0, r.extent(0)), + TakeSqrtFunctor(r)); } } // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Nrm2_Invoke( - const RV& r, const XV& x, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), r.extent(0)); - MV_Nrm2_Invoke(tempResult, x, take_sqrt); - Kokkos::deep_copy(typename XV::execution_space(), r, 
tempResult); + MV_Nrm2_Invoke( + space, tempResult, x, take_sqrt); + Kokkos::deep_copy(space, r, tempResult); + space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm2_spec.hpp b/blas/impl/KokkosBlas1_nrm2_spec.hpp index a8fd6eee5d..6c21e551a8 100644 --- a/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct nrm2_eti_spec_avail { #define KOKKOSBLAS1_NRM2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct nrm2_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -68,6 +69,7 @@ struct nrm2_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrm2_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -90,20 +92,24 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = nrm2_eti_spec_avail::value> +template < + class execution_space, class RMV, class XMV, int rank = XMV::rank, + bool tpl_spec_avail = nrm2_tpl_spec_avail::value, + bool eti_spec_avail = nrm2_eti_spec_avail::value> struct Nrm2 { - static void nrm2(const RMV& R, const XMV& X, const bool& take_sqrt); + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, + const bool& take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2 for single vectors (1-D Views). 
-template -struct Nrm2 { +template +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const RMV& R, const XMV& X, const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, + const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<1-D>: RMV is not a Kokkos::View."); @@ -131,20 +137,23 @@ struct Nrm2 { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Nrm2_Invoke(R, X, take_sqrt); + V_Nrm2_Invoke(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke(R, X, take_sqrt); + V_Nrm2_Invoke(space, R, X, + take_sqrt); } Kokkos::Profiling::popRegion(); } }; -template -struct Nrm2 { +template +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const RV& R, const XMV& X, const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RV& R, const XMV& X, + const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<2-D>: RV is not a Kokkos::View."); @@ -176,19 +185,21 @@ struct Nrm2 { auto R0 = Kokkos::subview(R, 0); auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Nrm2_Invoke(R0, X0, take_sqrt); + V_Nrm2_Invoke( + space, R0, X0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke(R0, X0, - take_sqrt); + V_Nrm2_Invoke( + space, R0, X0, take_sqrt); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2_Invoke(R, X, take_sqrt); + MV_Nrm2_Invoke(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2_Invoke(R, X, take_sqrt); + MV_Nrm2_Invoke(space, R, X, + take_sqrt); } } Kokkos::Profiling::popRegion(); @@ -208,6 +219,7 @@ struct Nrm2 { // #define KOKKOSBLAS1_NRM2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Nrm2< \ + EXEC_SPACE, \ Kokkos::View< \ typename 
Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -224,6 +236,7 @@ struct Nrm2 { // #define KOKKOSBLAS1_NRM2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Nrm2< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -243,6 +256,7 @@ struct Nrm2 { #define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Nrm2< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -262,6 +276,7 @@ struct Nrm2 { #define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Nrm2< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -274,7 +289,5 @@ struct Nrm2 { 2, false, true>; #include -#include -#include #endif // KOKKOSBLAS1_NRM2_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/blas/impl/KokkosBlas1_nrm2w_impl.hpp index 69667bf838..fb9b1f7858 100644 --- a/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -40,7 +40,7 @@ struct V_Nrm2w_Functor { typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; typename XV::const_type m_x, m_w; @@ -83,8 +83,7 @@ struct V_Nrm2w_Functor { KOKKOS_INLINE_FUNCTION void final(value_type& update) const { if (m_take_sqrt) update = - Kokkos::Details::ArithTraits::sqrt( - update); + Kokkos::ArithTraits::sqrt(update); } }; @@ -93,7 +92,7 @@ struct Nrm2w_MV_Functor { typedef typename RV::non_const_value_type rvalue_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; using TeamMem = typename 
Kokkos::TeamPolicy::member_type; @@ -135,12 +134,11 @@ struct Nrm2w_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. -template -void V_Nrm2w_Invoke(const RV& r, const XV& X, const XV& W, - const bool& take_sqrt) { - typedef typename XV::execution_space execution_space; +template +void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, + const XV& W, const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_Nrm2w_Functor functor_type; functor_type op(X, W, take_sqrt); @@ -151,13 +149,13 @@ void V_Nrm2w_Invoke(const RV& r, const XV& X, const XV& W, /// multivector (2-D View) X, and store result(s) in the 1-D View r. // Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Nrm2w_Invoke( - const RV& r, const XV& x, const XV& w, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, const XV& w, + bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; + execution_space, typename XV::memory_space>::accessible>::type* = + nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm2w (rank-2): result vector has wrong length (" @@ -166,39 +164,41 @@ void MV_Nrm2w_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, - Kokkos::ArithTraits::zero()); + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), x.extent(1), teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("KokkosBlas1::Nrm2w::S1", pol, Nrm2w_MV_Functor( r, 
x, w, teamsPerVec)); if (take_sqrt) { - Kokkos::parallel_for("KokkosBlas1::Nrm2w::Sqrt", - Kokkos::RangePolicy(0, r.extent(0)), - TakeSqrtFunctor(r)); + Kokkos::parallel_for( + "KokkosBlas1::Nrm2w::Sqrt", + Kokkos::RangePolicy(space, 0, r.extent(0)), + TakeSqrtFunctor(r)); } } // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Nrm2w_Invoke( - const RV& r, const XV& x, const XV& w, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, const XV& w, + bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename XV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2w temp result"), r.extent(0)); - MV_Nrm2w_Invoke(tempResult, x, w, + MV_Nrm2w_Invoke(space, tempResult, x, w, take_sqrt); - Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + Kokkos::deep_copy(space, r, tempResult); + space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/blas/impl/KokkosBlas1_nrm2w_spec.hpp index b25199c1f5..f4bbe286ef 100644 --- a/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2w_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct nrm2w_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrm2w_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -68,6 +69,7 @@ struct nrm2w_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrm2w_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, Kokkos::Device, \ @@ -88,22 +90,25 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool 
eti_spec_avail = nrm2w_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + nrm2w_eti_spec_avail::value> struct Nrm2w { - static void nrm2w(const RMV& R, const XMV& X, const XMV& W, - const bool& take_sqrt); + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, + const XMV& W, const bool& take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2w for single vectors (1-D Views). -template -struct Nrm2w { - typedef typename XMV::size_type size_type; +template +struct Nrm2w { + using size_type = typename XMV::size_type; - static void nrm2w(const RMV& R, const XMV& X, const XMV& W, - const bool& take_sqrt) { + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, + const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2w<1-D>: RMV is not a Kokkos::View."); @@ -131,21 +136,23 @@ struct Nrm2w { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Nrm2w_Invoke(R, X, W, take_sqrt); + V_Nrm2w_Invoke(space, R, X, W, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke(R, X, W, take_sqrt); + V_Nrm2w_Invoke(space, R, X, W, + take_sqrt); } Kokkos::Profiling::popRegion(); } }; -template -struct Nrm2w { - typedef typename XMV::size_type size_type; +template +struct Nrm2w { + using size_type = typename XMV::size_type; - static void nrm2w(const RV& R, const XMV& X, const XMV& W, - const bool& take_sqrt) { + static void nrm2w(const execution_space& space, const RV& R, const XMV& X, + const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2w<2-D>: RV is not a Kokkos::View."); @@ -178,19 +185,22 @@ struct Nrm2w { auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); auto W0 = Kokkos::subview(W, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Nrm2w_Invoke(R0, X0, W0, take_sqrt); + V_Nrm2w_Invoke( + space, R0, X0, W0, 
take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke(R0, X0, W0, - take_sqrt); + V_Nrm2w_Invoke( + space, R0, X0, W0, take_sqrt); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + MV_Nrm2w_Invoke(space, R, X, W, + take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + MV_Nrm2w_Invoke(space, R, X, W, + take_sqrt); } } Kokkos::Profiling::popRegion(); @@ -226,6 +236,7 @@ struct Nrm2w { // #define KOKKOSBLAS1_NRM2W_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Nrm2w< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -245,6 +256,7 @@ struct Nrm2w { #define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Nrm2w< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, Kokkos::Device, \ @@ -262,6 +274,7 @@ struct Nrm2w { #define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Nrm2w< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, Kokkos::Device, \ @@ -272,7 +285,5 @@ struct Nrm2w { 2, false, true>; #include -#include -#include #endif // KOKKOSBLAS1_NRM2W_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrminf_impl.hpp b/blas/impl/KokkosBlas1_nrminf_impl.hpp index c42aff8ba2..b8431ac8ea 100644 --- a/blas/impl/KokkosBlas1_nrminf_impl.hpp +++ b/blas/impl/KokkosBlas1_nrminf_impl.hpp @@ -38,7 +38,7 @@ struct V_NrmInf_Functor { typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; typename XV::const_type m_x; @@ -69,10 +69,9 @@ struct V_NrmInf_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and 
store the result in the 0-D View r. -template -void V_NrmInf_Invoke(const RV& r, const XV& X) { - typedef typename XV::execution_space execution_space; - typedef Kokkos::Details::ArithTraits AT; +template +void V_NrmInf_Invoke(const execution_space& space, const RV& r, const XV& X) { + typedef Kokkos::ArithTraits AT; const SizeType numRows = static_cast(X.extent(0)); @@ -82,7 +81,7 @@ void V_NrmInf_Invoke(const RV& r, const XV& X) { return; } - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_NrmInf_Functor functor_type; functor_type op(X); @@ -92,12 +91,13 @@ void V_NrmInf_Invoke(const RV& r, const XV& X) { /// \brief Compute the 2-norms (or their square) of the columns of the /// multivector (2-D View) X, and store result(s) in the 1-D View r. -template -void MV_NrmInf_Invoke(const RV& r, const XMV& X) { +template +void MV_NrmInf_Invoke(const execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_NrmInf_Invoke(ri, Xi); + V_NrmInf_Invoke( + space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_nrminf_spec.hpp b/blas/impl/KokkosBlas1_nrminf_spec.hpp index 69bc0eeb47..3659d61f19 100644 --- a/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrminf_eti_spec_avail { enum : bool { value = false }; }; @@ -47,6 +47,7 @@ struct nrminf_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrminf_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -69,6 +70,7 @@ struct nrminf_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrminf_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -91,20 +93,23 @@ 
namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = nrminf_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + nrminf_eti_spec_avail::value> struct NrmInf { - static void nrminf(const RMV& R, const XMV& X); + static void nrminf(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of NrmInf for single vectors (1-D Views). -template -struct NrmInf { +template +struct NrmInf { typedef typename XMV::size_type size_type; - static void nrminf(const RMV& R, const XMV& X) { + static void nrminf(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "NrmInf<1-D>: RMV is not a Kokkos::View."); @@ -132,20 +137,21 @@ struct NrmInf { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_NrmInf_Invoke(R, X); + V_NrmInf_Invoke(space, R, X); } else { typedef std::int64_t index_type; - V_NrmInf_Invoke(R, X); + V_NrmInf_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct NrmInf { +template +struct NrmInf { typedef typename XMV::size_type size_type; - static void nrminf(const RV& R, const XMV& X) { + static void nrminf(const execution_space& space, const RV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "NrmInf<2-D>: RV is not a Kokkos::View."); @@ -175,10 +181,10 @@ struct NrmInf { const size_type numCols = X.extent(1); if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_NrmInf_Invoke(R, X); + MV_NrmInf_Invoke(space, R, X); } else { typedef std::int64_t index_type; - MV_NrmInf_Invoke(R, X); + MV_NrmInf_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } @@ -198,6 +204,7 @@ struct NrmInf { #define KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct NrmInf< \ + EXEC_SPACE, \ 
Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -215,6 +222,7 @@ struct NrmInf { #define KOKKOSBLAS1_NRMINF_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct NrmInf< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -234,6 +242,7 @@ struct NrmInf { #define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct NrmInf< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -253,6 +262,7 @@ struct NrmInf { #define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct NrmInf< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -265,7 +275,5 @@ struct NrmInf { 2, false, true>; #include -#include -#include #endif // KOKKOSBLAS1_NRMINF_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_reciprocal_impl.hpp b/blas/impl/KokkosBlas1_reciprocal_impl.hpp index 1db1b9100d..21f736ac4f 100644 --- a/blas/impl/KokkosBlas1_reciprocal_impl.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_impl.hpp @@ -30,9 +30,8 @@ namespace Impl { // Entry-wise reciprocalolute value / magnitude: R(i,j) = reciprocal(X(i,j)). template struct MV_Reciprocal_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -69,9 +68,8 @@ struct MV_Reciprocal_Functor { // reciprocal(R(i,j)). template struct MV_ReciprocalSelf_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -100,9 +98,8 @@ struct MV_ReciprocalSelf_Functor { // reciprocal(X(i)). 
template struct V_Reciprocal_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV R_; XV X_; @@ -130,9 +127,8 @@ struct V_Reciprocal_Functor { // reciprocal(R(i)). template struct V_ReciprocalSelf_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV R_; @@ -151,8 +147,9 @@ struct V_ReciprocalSelf_Functor { // Invoke the "generic" (not unrolled) multivector functor that // computes entry-wise reciprocalolute value. -template -void MV_Reciprocal_Generic(const RMV& R, const XMV& X) { +template +void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Reciprocal_Generic: RMV is not a Kokkos::View."); @@ -166,9 +163,8 @@ void MV_Reciprocal_Generic(const RMV& R, const XMV& X) { "KokkosBlas::Impl::" "MV_Reciprocal_Generic: XMV is not rank 2"); - typedef typename XMV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (R == X) { // if R and X are the same (alias one another) MV_ReciprocalSelf_Functor op(R); @@ -180,8 +176,9 @@ void MV_Reciprocal_Generic(const RMV& R, const XMV& X) { } // Variant of MV_Reciprocal_Generic for single vectors (1-D Views) R and X. 
-template -void V_Reciprocal_Generic(const RV& R, const XV& X) { +template +void V_Reciprocal_Generic(const execution_space& space, const RV& R, + const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Reciprocal_Generic: RV is not a Kokkos::View."); @@ -195,9 +192,8 @@ void V_Reciprocal_Generic(const RV& R, const XV& X) { "KokkosBlas::Impl::" "V_Reciprocal_Generic: XV is not rank 1"); - typedef typename XV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (R == X) { // if R and X are the same (alias one another) V_ReciprocalSelf_Functor op(R); diff --git a/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/blas/impl/KokkosBlas1_reciprocal_spec.hpp index 1a40aa3542..08fc8bc341 100644 --- a/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct reciprocal_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct reciprocal_eti_spec_avail { MEM_SPACE) \ template <> \ struct reciprocal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct reciprocal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View::value, - bool eti_spec_avail = reciprocal_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + reciprocal_eti_spec_avail::value> struct Reciprocal { - static void reciprocal(const RMV& R, const XMV& X); + static void reciprocal(const execution_space& space, const RMV& R, + const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Reciprocal for single vectors (1-D Views). 
-template -struct Reciprocal { +template +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const RMV& R, const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<1-D>: RMV is not a Kokkos::View."); @@ -127,20 +134,22 @@ struct Reciprocal { if (numRows < static_cast(INT_MAX)) { typedef int index_type; - V_Reciprocal_Generic(R, X); + V_Reciprocal_Generic(space, R, X); } else { typedef std::int64_t index_type; - V_Reciprocal_Generic(R, X); + V_Reciprocal_Generic(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Reciprocal { +template +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const RMV& R, const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<2-D>: RMV is not a Kokkos::View."); @@ -171,10 +180,10 @@ struct Reciprocal { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Reciprocal_Generic(R, X); + MV_Reciprocal_Generic(space, R, X); } else { typedef std::int64_t index_type; - MV_Reciprocal_Generic(R, X); + MV_Reciprocal_Generic(space, R, X); } Kokkos::Profiling::popRegion(); } @@ -194,6 +203,7 @@ struct Reciprocal { #define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Reciprocal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Reciprocal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Reciprocal< \ + EXEC_SPACE, \ Kokkos::View, \ 
Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Reciprocal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { 2, false, true>; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_RECIPROCAL_HPP_ diff --git a/blas/impl/KokkosBlas1_rot_spec.hpp b/blas/impl/KokkosBlas1_rot_spec.hpp index 6547884d46..214e0399e5 100644 --- a/blas/impl/KokkosBlas1_rot_spec.hpp +++ b/blas/impl/KokkosBlas1_rot_spec.hpp @@ -138,6 +138,5 @@ struct Rot; #include -#include #endif // KOKKOSBLAS1_ROT_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotg_spec.hpp b/blas/impl/KokkosBlas1_rotg_spec.hpp index 9b911a28f6..bdf313e3d0 100644 --- a/blas/impl/KokkosBlas1_rotg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotg_spec.hpp @@ -135,6 +135,5 @@ struct Rotg; #include -#include #endif // KOKKOSBLAS1_ROTG_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotm_spec.hpp b/blas/impl/KokkosBlas1_rotm_spec.hpp index 9cc9ae3e61..854f2abacc 100644 --- a/blas/impl/KokkosBlas1_rotm_spec.hpp +++ b/blas/impl/KokkosBlas1_rotm_spec.hpp @@ -133,6 +133,5 @@ struct Rotm; #include -#include #endif // KOKKOSBLAS1_ROTM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotmg_spec.hpp b/blas/impl/KokkosBlas1_rotmg_spec.hpp index b3aeaa1da3..b90a158654 100644 --- a/blas/impl/KokkosBlas1_rotmg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotmg_spec.hpp @@ -141,6 +141,5 @@ struct Rotmg; #include -#include #endif // KOKKOSBLAS1_ROTMG_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_scal_impl.hpp b/blas/impl/KokkosBlas1_scal_impl.hpp index 21974f1f7e..541d9a4934 100644 --- a/blas/impl/KokkosBlas1_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_impl.hpp @@ -44,9 +44,8 @@ namespace Impl { // coefficients in the a vector, if used. 
template struct V_Scal_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV m_r; XV m_x; @@ -101,9 +100,8 @@ struct V_Scal_Functor { template struct V_Scal_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV m_r; XV m_x; @@ -134,9 +132,9 @@ struct V_Scal_Functor -void V_Scal_Generic(const RV& r, const AV& av, const XV& x, - const SizeType startingColumn, int a = 2) { +template +void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, + const XV& x, const SizeType startingColumn, int a = 2) { static_assert(Kokkos::is_view::value, "V_Scal_Generic: RV is not a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -144,9 +142,8 @@ void V_Scal_Generic(const RV& r, const AV& av, const XV& x, static_assert(RV::rank == 1, "V_Scal_Generic: RV is not rank 1."); static_assert(XV::rank == 1, "V_Scal_Generic: XV is not rank 1."); - typedef typename RV::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { V_Scal_Functor op(r, x, av, startingColumn); diff --git a/blas/impl/KokkosBlas1_scal_mv_impl.hpp b/blas/impl/KokkosBlas1_scal_mv_impl.hpp index f43101bd10..da4d7a5149 100644 --- a/blas/impl/KokkosBlas1_scal_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_mv_impl.hpp @@ -45,9 +45,8 @@ namespace Impl { template struct MV_Scal_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -127,9 +126,8 @@ struct MV_Scal_Functor { template struct MV_Scal_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - 
typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV m_r; @@ -198,9 +196,8 @@ struct MV_Scal_Functor struct MV_Scal_Unroll_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RMV m_r; XMV m_x; @@ -259,9 +256,8 @@ struct MV_Scal_Unroll_Functor { template struct MV_Scal_Unroll_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RMV m_r; XMV m_x; @@ -323,16 +319,16 @@ struct MV_Scal_Unroll_Functor -void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, +template +void MV_Scal_Unrolled(const execution_space& space, const RMV& r, + const aVector& av, const XMV& x, const SizeType startingColumn, int a = 2) { - typedef typename XMV::execution_space execution_space; - if (a == 0) { MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S0", policy, op); return; } @@ -340,7 +336,7 @@ void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S1", policy, op); return; } @@ -348,7 +344,7 @@ void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S2", policy, op); return; } @@ -357,7 +353,7 @@ void MV_Scal_Unrolled(const RMV& r, const 
aVector& av, const XMV& x, MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S3", policy, op); } @@ -375,12 +371,13 @@ void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. -template -void MV_Scal_Generic(const RVector& r, const aVector& av, const XVector& x, +template +void MV_Scal_Generic(const execution_space& space, const RVector& r, + const aVector& av, const XVector& x, const SizeType startingColumn, int a = 2) { - typedef typename XVector::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { MV_Scal_Functor op(r, x, av, @@ -421,8 +418,9 @@ void MV_Scal_Generic(const RVector& r, const aVector& av, const XVector& x, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. 
-template -void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { +template +void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, + const AV& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); #if KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL <= 2 @@ -439,7 +437,8 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled(R_cur, av, X_cur, j, a); + MV_Scal_Unrolled( + space, R_cur, av, X_cur, j, a); } for (; j + 4 <= numCols; j += 4) { const std::pair rng(j, j + 4); @@ -448,7 +447,8 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled(R_cur, av, X_cur, j, a); + MV_Scal_Unrolled( + space, R_cur, av, X_cur, j, a); } for (; j < numCols; ++j) { // RMV and XMV need to turn 1-D. @@ -457,7 +457,8 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { typedef decltype(r_cur) RV; typedef decltype(x_cur) XV; - V_Scal_Generic(r_cur, av, x_cur, j, a); + V_Scal_Generic(space, r_cur, av, + x_cur, j, a); } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL > 2 @@ -469,39 +470,73 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { typedef decltype(r_0) RV; typedef decltype(x_0) XV; - V_Scal_Generic(r_0, av, x_0, 0, a); + V_Scal_Generic(space, r_0, av, x_0, + 0, a); break; } - case 2: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 3: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 4: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 5: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 6: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 7: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 8: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 9: MV_Scal_Unrolled(r, av, x, 0, a); break; + case 2: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 3: + 
MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 4: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 5: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 6: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 7: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 8: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 9: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; case 10: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 11: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 12: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 13: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 14: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 15: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 16: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; - default: MV_Scal_Generic(r, av, x, 0, a); + default: + MV_Scal_Generic(space, r, av, x, + 0, a); } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL @@ -521,9 +556,10 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. 
-template -void MV_Scal_Invoke_Right(const RMV& r, const aVector& av, const XMV& x, - int a = 2) { +template +void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, + const aVector& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); if (numCols == 1) { @@ -536,9 +572,11 @@ void MV_Scal_Invoke_Right(const RMV& r, const aVector& av, const XMV& x, RV r_0 = Kokkos::subview(r, Kokkos::ALL(), 0); XV x_0 = Kokkos::subview(x, Kokkos::ALL(), 0); - V_Scal_Generic(r_0, av, x_0, a); + V_Scal_Generic(space, r_0, + av, x_0, a); } else { - MV_Scal_Generic(r, av, x, a); + MV_Scal_Generic(space, r, av, + x, a); } } diff --git a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index 82bf4709b7..38972b2223 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -29,7 +29,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +47,7 @@ struct scal_eti_spec_avail { #define KOKKOSBLAS1_SCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct scal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -67,6 +69,7 @@ struct scal_eti_spec_avail { MEM_SPACE) \ template <> \ struct scal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct scal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -99,23 +103,28 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = scal_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + scal_eti_spec_avail::value> struct Scal { - static void scal(const RV& R, const AV& A, const XV& X); + static void scal(const execution_space& space, const RV& R, const AV& A, + const XV& X); }; #if 
!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Scal for single vectors (1-D Views). -template -struct Scal { +template +struct Scal { typedef typename XV::non_const_value_type AV; typedef typename XV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; + typedef Kokkos::ArithTraits ATA; - static void scal(const RV& R, const AV& alpha, const XV& X) { + static void scal(const execution_space& space, const RV& R, const AV& alpha, + const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<1-D>: RV is not a Kokkos::View."); @@ -154,10 +163,12 @@ struct Scal(INT_MAX)) { typedef int index_type; - V_Scal_Generic(R, alpha, X, a); + V_Scal_Generic(space, R, alpha, + X, a); } else { typedef typename XV::size_type index_type; - V_Scal_Generic(R, alpha, X, a); + V_Scal_Generic(space, R, alpha, + X, a); } Kokkos::Profiling::popRegion(); } @@ -169,12 +180,14 @@ struct Scal -struct Scal { +template +struct Scal { typedef typename XMV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; + typedef Kokkos::ArithTraits ATA; - static void scal(const RMV& R, const AV& av, const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& av, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D>: RMV is not a Kokkos::View."); @@ -212,10 +225,12 @@ struct Scal { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left(R, av, X, a); + MV_Scal_Invoke_Left(space, R, + av, X, a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left(R, av, X, a); + MV_Scal_Invoke_Left(space, R, + av, X, a); } Kokkos::Profiling::popRegion(); } @@ -227,14 +242,15 @@ struct Scal { /// /// 1. R(i,j) = a*X(i,j) for a in -1,0,1 /// 2. 
R(i,j) = alpha*X(i,j) -template -struct Scal { +template +struct Scal { typedef typename XMV::non_const_value_type AV; typedef typename XMV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; + typedef Kokkos::ArithTraits ATA; - static void scal(const RMV& R, const AV& alpha, const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& alpha, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D, AV=scalar>: RMV is not a Kokkos::View."); @@ -275,12 +291,14 @@ struct Scal(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left(R, alpha, X, a); + MV_Scal_Invoke_Left( + space, R, alpha, X, a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left(R, alpha, X, a); + MV_Scal_Invoke_Left( + space, R, alpha, X, a); } Kokkos::Profiling::popRegion(); } @@ -299,6 +317,7 @@ struct Scal, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -309,6 +328,7 @@ struct Scal, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -326,6 +346,7 @@ struct Scal, \ Kokkos::MemoryTraits >, \ Kokkos::View >, \ 2, false, true>; \ extern template struct Scal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -347,6 +369,7 @@ struct Scal, \ Kokkos::MemoryTraits >, \ Kokkos::View >, \ 2, false, true>; \ template struct Scal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -366,7 +390,5 @@ struct Scal; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_SCAL_HPP_ diff --git a/blas/impl/KokkosBlas1_sum_impl.hpp b/blas/impl/KokkosBlas1_sum_impl.hpp index 08dc4e36e4..864c983541 100644 --- a/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/blas/impl/KokkosBlas1_sum_impl.hpp @@ -36,11 +36,10 @@ namespace Impl { /// \tparam SizeType Index type. Use int (32 bits) if possible. 
template struct V_Sum_Functor { - typedef typename XV::execution_space execution_space; typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename RV::non_const_value_type value_type; typename XV::const_type m_x; @@ -104,11 +103,10 @@ struct Sum_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. -template -void V_Sum_Invoke(const RV& r, const XV& X) { - typedef typename XV::execution_space execution_space; +template +void V_Sum_Invoke(const execution_space& space, const RV& r, const XV& X) { const SizeType numRows = static_cast(X.extent(0)); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_Sum_Functor functor_type; functor_type op(X); @@ -119,13 +117,12 @@ void V_Sum_Invoke(const RV& r, const XV& X) { /// multivector (2-D View) X, and store result(s) in the 1-D View r. 
// Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Sum_Invoke( - const RV& r, const XV& x, + const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::Sum (rank-2): result vector has wrong length (" @@ -134,14 +131,13 @@ void MV_Sum_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, - Kokkos::ArithTraits::zero()); + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), x.extent(1), teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for( "KokkosBlas1::Sum::S1", pol, Sum_MV_Functor(r, x, teamsPerVec)); @@ -149,18 +145,20 @@ void MV_Sum_Invoke( // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Sum_Invoke( - const RV& r, const XV& x, + const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), r.extent(0)); - MV_Sum_Invoke(tempResult, x); - Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + MV_Sum_Invoke( + space, tempResult, x); + Kokkos::deep_copy(space, r, tempResult); + space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_sum_spec.hpp b/blas/impl/KokkosBlas1_sum_spec.hpp index db1771de8f..458e7ffdb7 100644 --- a/blas/impl/KokkosBlas1_sum_spec.hpp 
+++ b/blas/impl/KokkosBlas1_sum_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct sum_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct sum_eti_spec_avail { #define KOKKOSBLAS1_SUM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct sum_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View \ struct sum_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -86,20 +88,22 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = sum_eti_spec_avail::value> +template < + class execution_space, class RMV, class XMV, int rank = XMV::rank, + bool tpl_spec_avail = sum_tpl_spec_avail::value, + bool eti_spec_avail = sum_eti_spec_avail::value> struct Sum { - static void sum(const RMV& R, const XMV& X); + static void sum(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Sum for single vectors (1-D Views). 
-template -struct Sum { +template +struct Sum { typedef typename XMV::size_type size_type; - static void sum(const RMV& R, const XMV& X) { + static void sum(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Sum<1-D>: RMV is not a Kokkos::View."); @@ -128,20 +132,21 @@ struct Sum { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Sum_Invoke(R, X); + V_Sum_Invoke(space, R, X); } else { typedef std::int64_t index_type; - V_Sum_Invoke(R, X); + V_Sum_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Sum { +template +struct Sum { typedef typename XMV::size_type size_type; - static void sum(const RV& R, const XMV& X) { + static void sum(const execution_space& space, const RV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Sum<2-D>: RV is not a Kokkos::View."); @@ -173,18 +178,20 @@ struct Sum { auto R0 = Kokkos::subview(R, 0); auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Sum_Invoke(R0, X0); + V_Sum_Invoke(space, + R0, X0); } else { typedef std::int64_t index_type; - V_Sum_Invoke(R0, X0); + V_Sum_Invoke( + space, R0, X0); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Sum_Invoke(R, X); + MV_Sum_Invoke(space, R, X); } else { typedef std::int64_t index_type; - MV_Sum_Invoke(R, X); + MV_Sum_Invoke(space, R, X); } } Kokkos::Profiling::popRegion(); @@ -204,6 +211,7 @@ struct Sum { // #define KOKKOSBLAS1_SUM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Sum< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { // use this macro in one or more .cpp files in this directory. 
// #define KOKKOSBLAS1_SUM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Sum >, \ Kokkos::View, \ @@ -234,6 +243,7 @@ struct Sum { #define KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Sum< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -251,6 +261,7 @@ struct Sum { #define KOKKOSBLAS1_SUM_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Sum< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -261,7 +272,5 @@ struct Sum { 2, false, true>; #include -#include -#include #endif // KOKKOSBLAS1_SUM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_swap_spec.hpp b/blas/impl/KokkosBlas1_swap_spec.hpp index ed0a14e257..db09a62f8f 100644 --- a/blas/impl/KokkosBlas1_swap_spec.hpp +++ b/blas/impl/KokkosBlas1_swap_spec.hpp @@ -134,6 +134,5 @@ struct Swap; #include -#include #endif // KOKKOSBLAS1_SWAP_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_team_abs_spec.hpp b/blas/impl/KokkosBlas1_team_abs_spec.hpp index 82418fe7d1..bcd9545738 100644 --- a/blas/impl/KokkosBlas1_team_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_team_abs_spec.hpp @@ -35,7 +35,7 @@ struct team_abs_tpl_spec_avail { template ::value> struct TeamAbs { - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, const XV& X); @@ -43,7 +43,7 @@ struct TeamAbs { template struct TeamAbs { - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, const XV& X) { diff --git a/blas/impl/KokkosBlas1_team_dot_spec.hpp b/blas/impl/KokkosBlas1_team_dot_spec.hpp index c141694926..041920d109 100644 --- a/blas/impl/KokkosBlas1_team_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_team_dot_spec.hpp @@ -53,7 +53,7 @@ struct TeamDot { static KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, const XV& X, const YV& Y) { - dot_type result = 0.0; // 
Kokkos::Details::ArithTraitszero(); + dot_type result = 0.0; // Kokkos::ArithTraitszero(); int N = X.extent(0); Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, N), diff --git a/blas/impl/KokkosBlas1_team_nrm2_spec.hpp b/blas/impl/KokkosBlas1_team_nrm2_spec.hpp index 4ed19ef5df..ef050cb73b 100644 --- a/blas/impl/KokkosBlas1_team_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_team_nrm2_spec.hpp @@ -40,7 +40,7 @@ struct TeamNrm2 { typedef Kokkos::Details::InnerProductSpaceTraits< typename XV::non_const_value_type> IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, const XV& X); @@ -53,11 +53,11 @@ struct TeamNrm2 { typedef Kokkos::Details::InnerProductSpaceTraits< typename XV::non_const_value_type> IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, const XV& X) { - mag_type result = 0.0; // Kokkos::Details::ArithTraitszero(); + mag_type result = 0.0; // Kokkos::ArithTraitszero(); int N = X.extent(0); Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, N), diff --git a/blas/impl/KokkosBlas1_update_impl.hpp b/blas/impl/KokkosBlas1_update_impl.hpp index 5866764faf..96aca5c70e 100644 --- a/blas/impl/KokkosBlas1_update_impl.hpp +++ b/blas/impl/KokkosBlas1_update_impl.hpp @@ -43,9 +43,8 @@ namespace Impl { template struct MV_Update_Functor { - typedef typename ZMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; const typename XMV::non_const_value_type alpha_; @@ -213,9 +212,8 @@ struct MV_Update_Functor { template struct V_Update_Functor { - typedef typename ZV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; const typename 
XV::non_const_value_type alpha_; @@ -316,8 +314,10 @@ struct V_Update_Functor { // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding multivector entry. -template -void MV_Update_Generic(const typename XMV::non_const_value_type& alpha, +template +void MV_Update_Generic(const execution_space& space, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -347,9 +347,8 @@ void MV_Update_Generic(const typename XMV::non_const_value_type& alpha, "KokkosBlas::Impl::MV_Update_Generic: " "XMV, YMV, and ZMV must have rank 2."); - typedef typename XMV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { if (b == 0) { @@ -417,8 +416,9 @@ void MV_Update_Generic(const typename XMV::non_const_value_type& alpha, // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding vector entry. 
-template -void V_Update_Generic(const typename XV::non_const_value_type& alpha, +template +void V_Update_Generic(const execution_space& space, + const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, const YV& Y, @@ -448,9 +448,8 @@ void V_Update_Generic(const typename XV::non_const_value_type& alpha, "KokkosBlas::Impl::V_Update_Generic: " "XV, YV, and ZV must have rank 1."); - typedef typename XV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { if (b == 0) { diff --git a/blas/impl/KokkosBlas1_update_spec.hpp b/blas/impl/KokkosBlas1_update_spec.hpp index 78a6d9aa09..9a54888012 100644 --- a/blas/impl/KokkosBlas1_update_spec.hpp +++ b/blas/impl/KokkosBlas1_update_spec.hpp @@ -27,7 +27,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_eti_spec_avail { enum : bool { value = false }; }; @@ -45,6 +46,7 @@ struct update_eti_spec_avail { MEM_SPACE) \ template <> \ struct update_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -68,6 +70,7 @@ struct update_eti_spec_avail { MEM_SPACE) \ template <> \ struct update_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -100,11 +103,15 @@ namespace Impl { /// Z(i,j) = alpha*X(i,j) + beta*Y(i,j) + gamma*Z(i,j), /// /// with special cases for alpha, beta, or gamma = 0. 
-template ::value, - bool eti_spec_avail = update_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + update_eti_spec_avail::value> struct Update { - static void update(const typename XMV::non_const_value_type& alpha, + static void update(const execution_space& space, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -114,14 +121,16 @@ struct Update { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for XMV, YMV, and ZMV rank-2 Views. -template -struct Update { +template +struct Update { typedef typename XMV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATB; - typedef Kokkos::Details::ArithTraits ATC; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATB; + typedef Kokkos::ArithTraits ATC; - static void update(const typename XMV::non_const_value_type& alpha, + static void update(const execution_space& space, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -194,24 +203,24 @@ struct Update { if (numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - V_Update_Generic(alpha, X_0, beta, Y_0, gamma, Z_0, a, b, - c); + V_Update_Generic(space, alpha, X_0, beta, + Y_0, gamma, Z_0, a, b, c); } else { typedef typename XMV::size_type index_type; - V_Update_Generic(alpha, X_0, beta, Y_0, gamma, Z_0, a, b, - c); + V_Update_Generic(space, alpha, X_0, beta, + Y_0, gamma, Z_0, a, b, c); } } else { if (numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Update_Generic(alpha, X, beta, Y, gamma, - Z, a, b, c); + MV_Update_Generic( + space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XMV::size_type index_type; - MV_Update_Generic(alpha, X, beta, Y, gamma, - Z, a, b, c); + MV_Update_Generic( + space, alpha, X, beta, Y, gamma, Z, a, 
b, c); } } Kokkos::Profiling::popRegion(); @@ -219,14 +228,16 @@ struct Update { }; // Partial specialization for XV, YV, and ZV rank-1 Views. -template -struct Update { +template +struct Update { typedef typename XV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATB; - typedef Kokkos::Details::ArithTraits ATC; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATB; + typedef Kokkos::ArithTraits ATC; - static void update(const typename XV::non_const_value_type& alpha, + static void update(const execution_space& space, + const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, const YV& Y, const typename ZV::non_const_value_type& gamma, @@ -291,12 +302,12 @@ struct Update { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - V_Update_Generic(alpha, X, beta, Y, gamma, Z, a, - b, c); + V_Update_Generic( + space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XV::size_type index_type; - V_Update_Generic(alpha, X, beta, Y, gamma, Z, a, - b, c); + V_Update_Generic( + space, alpha, X, beta, Y, gamma, Z, a, b, c); } Kokkos::Profiling::popRegion(); } @@ -318,6 +329,7 @@ struct Update { #define KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Update< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -331,6 +343,7 @@ struct Update { #define KOKKOSBLAS1_UPDATE_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Update< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -352,6 +365,7 @@ struct Update { #define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Update< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -365,6 +379,7 @@ struct Update { #define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ 
MEM_SPACE) \ template struct Update< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -376,7 +391,5 @@ struct Update { 2, false, true>; #include -#include -#include #endif // KOKKOSBLAS1_UPDATE_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_gemv_impl.hpp b/blas/impl/KokkosBlas2_gemv_impl.hpp index 7d7403c14b..730f88602a 100644 --- a/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -180,7 +180,7 @@ struct SingleLevelTransposeGEMV { KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i, value_type y_cur) const { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; using KAT = ArithTraits; const auto x_i = x_(i); @@ -238,9 +238,9 @@ void singleLevelGemv(const typename AViewType::execution_space& space, // depend on that or its implementation details. Instead, we reuse // an instantiation of the non-transpose case for alpha=0. if (A.extent(0) == 0 && (tr != 'N' && tr != 'n')) { - if (beta == Kokkos::Details::ArithTraits::zero()) { - Kokkos::deep_copy(y, Kokkos::Details::ArithTraits::zero()); - } else if (beta != Kokkos::Details::ArithTraits::one()) { + if (beta == Kokkos::ArithTraits::zero()) { + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + } else if (beta != Kokkos::ArithTraits::one()) { // "Fake out" a scal() by using the non-transpose alpha=0, // general beta case. This assumes that the functor doesn't // check dimensions. 
@@ -255,12 +255,11 @@ void singleLevelGemv(const typename AViewType::execution_space& space, } if (tr == 'N' || tr == 'n') { - if (alpha == Kokkos::Details::ArithTraits::zero()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (alpha == Kokkos::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { // Fill y with zeros - Kokkos::deep_copy(y, - Kokkos::Details::ArithTraits::zero()); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 using functor_type = @@ -269,14 +268,14 @@ void singleLevelGemv(const typename AViewType::execution_space& space, functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } - } else if (alpha == Kokkos::Details::ArithTraits::one()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::one()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelNontransposeGEMV; @@ -290,13 +289,13 @@ void singleLevelGemv(const typename AViewType::execution_space& space, Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } } else { // alpha != 0 and alpha != 1 - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using 
functor_type = SingleLevelNontransposeGEMV; @@ -311,12 +310,11 @@ void singleLevelGemv(const typename AViewType::execution_space& space, } } } else if (tr == 'T' || tr == 't') { // transpose, no conjugate - if (alpha == Kokkos::Details::ArithTraits::zero()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (alpha == Kokkos::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { // Fill y with zeros - Kokkos::deep_copy(y, - Kokkos::Details::ArithTraits::zero()); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 using functor_type = @@ -326,15 +324,15 @@ void singleLevelGemv(const typename AViewType::execution_space& space, Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } - } else if (alpha == Kokkos::Details::ArithTraits::one()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::one()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelTransposeGEMV; @@ -350,14 +348,14 @@ void singleLevelGemv(const typename AViewType::execution_space& space, functor); } } else { // alpha != 0 and alpha != 1 - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using 
functor_type = SingleLevelTransposeGEMV; @@ -374,12 +372,11 @@ void singleLevelGemv(const typename AViewType::execution_space& space, } } } else if (tr == 'C' || tr == 'c' || tr == 'H' || tr == 'h') { // conj xpose - if (alpha == Kokkos::Details::ArithTraits::zero()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (alpha == Kokkos::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { // Fill y with zeros - Kokkos::deep_copy(y, - Kokkos::Details::ArithTraits::zero()); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 using functor_type = @@ -389,15 +386,15 @@ void singleLevelGemv(const typename AViewType::execution_space& space, Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } - } else if (alpha == Kokkos::Details::ArithTraits::one()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::one()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelTransposeGEMV; @@ -413,14 +410,14 @@ void singleLevelGemv(const typename AViewType::execution_space& space, functor); } } else { // alpha != 0 and alpha != 1 - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { 
using functor_type = SingleLevelTransposeGEMV; @@ -604,7 +601,7 @@ struct TwoLevelTransposeGEMV { public: KOKKOS_INLINE_FUNCTION void operator()(const member_type& team) const { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; using KAT_A = ArithTraits; using KAT_Y = ArithTraits; @@ -668,7 +665,7 @@ void twoLevelGemv(const typename AViewType::execution_space& space, using team_policy_type = Kokkos::TeamPolicy; using range_policy_type = Kokkos::RangePolicy; - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; using KAT = ArithTraits; using YKAT = ArithTraits; @@ -746,7 +743,7 @@ void twoLevelGemv(const typename AViewType::execution_space& space, } else { if (alpha == KAT::zero() && beta == KAT::zero()) { // Fill y with zeros - Kokkos::deep_copy(y, Kokkos::Details::ArithTraits::zero()); + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); } else if (alpha == KAT::zero() && beta == KAT::one()) { // Do nothing (y := 1 * y) } else if (tr == 'T') { diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index a4582b9d72..42e2465494 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -153,6 +153,5 @@ struct GEMV { false, true>; #include -#include #endif // KOKKOSBLAS1_GEMV_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp new file mode 100644 index 0000000000..fa2220e00a --- /dev/null +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -0,0 +1,256 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_IMPL_HPP_ +#define KOKKOSBLAS2_GER_IMPL_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosBlas { +namespace Impl { + +// Functor for a single-level parallel_for version of nontranspose GER. +// The functor parallelizes over rows of the input matrix A. +template +struct SingleLevelGER { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + SingleLevelGER(const bool justTranspose, const AlphaCoeffType& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) + : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { + // Nothing to do + } + + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType N(A_.extent(1)); + const XComponentType x_fixed(x_(i)); + + if (justTranspose_) { + for (IndexType j = 0; j < N; ++j) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + } else { + for (IndexType j = 0; j < N; ++j) { + A_(i, j) += + AComponentType(alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + } + } + } + } + + private: + bool justTranspose_; + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Single-level parallel version of GER. 
+template +void singleLevelGer(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (y.extent(0) == 0) { + // no entries to update + } else if (x.extent(0) == 0) { + // no entries to update + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + } else { + Kokkos::RangePolicy rangePolicy(space, 0, + A.extent(0)); + SingleLevelGER functor( + (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::ger[SingleLevel]", rangePolicy, functor); + } +} + +struct TwoLevelGER_LayoutLeftTag {}; +struct TwoLevelGER_LayoutRightTag {}; + +// --------------------------------------------------------------------------------------------- + +// Functor for a two-level parallel_reduce version of GER, designed for +// performance on GPU. Kernel depends on the layout of A. 
+template +struct TwoLevelGER { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TwoLevelGER(const bool justTranspose, const AlphaCoeffType& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) + : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { + // Nothing to do + } + + public: + // LayoutLeft version: one team per column + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGER_LayoutLeftTag, + const member_type& team) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType M(A_.extent(0)); + const IndexType j(team.league_rank()); + if (justTranspose_) { + const YComponentType y_fixed(y_(j)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + }); + } else { + const YComponentType y_fixed( + Kokkos::ArithTraits::conj(y_(j))); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + }); + } + } + } + + // LayoutRight version: one team per row + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGER_LayoutRightTag, + const member_type& team) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType N(A_.extent(1)); + const IndexType i(team.league_rank()); + const XComponentType x_fixed(x_(i)); + if (justTranspose_) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& 
j) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + }); + } + } + team.team_barrier(); + } + + private: + bool justTranspose_; + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Two-level parallel version of GER. +template +void twoLevelGer(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (y.extent(0) == 0) { + // no entries to update + return; + } else if (x.extent(0) == 0) { + // no entries to update + return; + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + return; + } + + constexpr bool isLayoutLeft = + std::is_same::value; + using layout_tag = + typename std::conditional::type; + using TeamPolicyType = Kokkos::TeamPolicy; + TeamPolicyType teamPolicy; + if (isLayoutLeft) { + // LayoutLeft: one team per column + teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); + } else { + // LayoutRight: one team per row + teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); + } + + TwoLevelGER + functor((trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::ger[twoLevel]", teamPolicy, functor); +} + +// --------------------------------------------------------------------------------------------- + +// generalGer: use 1 level (Range) or 2 level (Team) implementation, +// depending on whether execution space is CPU or GPU. +// The 'enable_if' makes sure unused kernels are not instantiated. 
+ +template ()>::type* = nullptr> +void generalGerImpl(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + singleLevelGer(space, trans, alpha, x, y, A); +} + +template ()>::type* = nullptr> +void generalGerImpl(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + twoLevelGer(space, trans, alpha, x, y, A); +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_GER_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp new file mode 100644 index 0000000000..9802194b98 --- /dev/null +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -0,0 +1,146 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_SPEC_HPP_ +#define KOKKOSBLAS2_GER_SPEC_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct ger_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization availability +// KokkosBlas::Impl::GER. This is NOT for users!!! 
All the declarations of full +// specializations go in this header file. We may spread out definitions (see +// _INST macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// +// ger +// + +// Implementation of KokkosBlas::ger. +template ::value, + bool eti_spec_avail = ger_eti_spec_avail::value> +struct GER { + static void ger(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? "KokkosBlas::ger[ETI]" + : "KokkosBlas::ger[noETI]"); + + typedef typename AViewType::size_type size_type; + const size_type numRows = A.extent(0); + const size_type numCols = A.extent(1); + + // Prefer int as the index type, but use a larger type if needed. + if ((numRows < static_cast(INT_MAX)) && + (numCols < static_cast(INT_MAX))) { + generalGerImpl( + space, trans, alpha, x, y, A); + } else { + generalGerImpl( + space, trans, alpha, x, y, A); + } + + Kokkos::Profiling::popRegion(); + } +#else + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; + +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization of KokkosBlas::Impl::GER. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. 
+// We may spread out definitions (see _DEF macro below) across one or more .cpp +// files. +// +#define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS2_GER_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include + +#endif // KOKKOSBLAS2_GER_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp index 31975dfa9e..aa7efc9122 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp @@ -31,7 +31,7 @@ struct OpID { struct OpConj { template KOKKOS_INLINE_FUNCTION ValueType operator()(ValueType v) const { - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; return KAT::conj(v); } }; diff --git a/blas/impl/KokkosBlas2_team_gemv_spec.hpp b/blas/impl/KokkosBlas2_team_gemv_spec.hpp index 355c1ca6cf..d46fb7be6f 100644 --- a/blas/impl/KokkosBlas2_team_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_team_gemv_spec.hpp @@ -61,7 +61,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "KokkosBlas::TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), @@ -76,7 +76,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, 
const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "KokkosBlas::TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), @@ -95,7 +95,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), @@ -110,7 +110,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), @@ -129,7 +129,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), @@ -145,7 +145,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); return 
Impl::TeamGemvInternal::invoke( member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), @@ -165,7 +165,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); return Impl::TeamVectorGemvInternal::invoke( member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), @@ -184,7 +184,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); return Impl::TeamVectorGemvInternal::invoke( member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), @@ -203,7 +203,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); return Impl::TeamVectorGemvInternal::invoke( member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), diff --git a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index 462ac0c744..26c4c9624a 100644 --- a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -43,8 +43,8 @@ struct DotBasedGEMM { using size_A = typename AV::size_type; using scalar_C = typename CV::non_const_value_type; using size_C = typename CV::size_type; - using AVT = Kokkos::Details::ArithTraits; - using CVT = Kokkos::Details::ArithTraits; + using AVT = 
Kokkos::ArithTraits; + using CVT = Kokkos::ArithTraits; const scalar_A alpha; const scalar_C beta; @@ -68,7 +68,7 @@ struct DotBasedGEMM { numCcols(C.extent(1)), dotSize(A.extent(0)) {} - void run(const typename CV::execution_space& space, bool conjugateTranspose) { + void run(const ExecSpace& space, bool conjugateTranspose) { multipleReductionWorkDistribution( dotSize, numCrows * numCcols, numDivPerDot); const size_C ndots = numCrows * numCcols; // Number of dot products @@ -77,12 +77,12 @@ struct DotBasedGEMM { // Initialize C matrix if beta != 1 if (beta == CVT::zero()) { Kokkos::MDRangePolicy> policyInit( - {0, 0}, {numCrows, numCcols}); + space, {0, 0}, {numCrows, numCcols}); Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } else if (beta != CVT::one()) { Kokkos::MDRangePolicy> policyInit( - {0, 0}, {numCrows, numCcols}); + space, {0, 0}, {numCrows, numCcols}); Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index f1f89b9908..4f3e62f343 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -65,7 +65,7 @@ template { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -115,7 +115,7 @@ struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -159,7 +159,7 @@ template { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const 
ViewTypeScratch& A_scr, @@ -209,7 +209,7 @@ struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -258,7 +258,7 @@ template { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -308,7 +308,7 @@ struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -356,7 +356,7 @@ template struct impl_update_matrix_block { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void update(const TeamHandle& team, const value_type& beta, @@ -417,7 +417,7 @@ template { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void update(const TeamHandle& team, const value_type& beta, @@ -611,10 +611,10 @@ struct GEMMImpl { ViewTypeCScratch::shmem_size(); #if defined(KOKKOS_ENABLE_HIP) - // Note lbv, 10/29/20: The LaunchBounds<384,2> leads + // Note lbv, 10/29/20: The LaunchBounds<384, 2> leads // to an error with HIP as the heuristics on that platform // yield an optimal_num_blocks=0 which means no ressources - // are allocated... Switching to LaunchBounds<384,2> fixes + // are allocated... Switching to LaunchBounds<384, 0> fixes // that problem but I'm not sure if that it a good perf // parameter or why it is set to 2 for Cuda? 
Kokkos::TeamPolicy> policy( diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index 5f443ab371..c340a41fc1 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gemm_eti_spec_avail { enum : bool { value = false }; }; @@ -47,6 +47,7 @@ struct gemm_eti_spec_avail { LAYOUTC, EXEC_SPACE, MEM_SPACE) \ template <> \ struct gemm_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -84,14 +85,15 @@ namespace Impl { // // Implementation of KokkosBlas::gemm. -template ::value, - bool eti_spec_avail = - gemm_eti_spec_avail::value> +template < + class execution_space, class AViewType, class BViewType, class CViewType, + bool tpl_spec_avail = gemm_tpl_spec_avail::value, + bool eti_spec_avail = gemm_eti_spec_avail::value> struct GEMM { - static void gemm(const typename CViewType::execution_space& space, - const char transA[], const char transB[], + static void gemm(const execution_space& space, const char transA[], + const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, @@ -118,14 +120,13 @@ struct GEMM { typedef typename AViewType::non_const_value_type ScalarA; typedef typename BViewType::non_const_value_type ScalarB; typedef typename CViewType::non_const_value_type ScalarC; - typedef typename CViewType::execution_space ExecSpace; // Figure out whether to use DotBased implementation const int M = static_cast(C.extent(0)); const int N = static_cast(C.extent(1)); const bool is_device_space = - KokkosKernels::Impl::kk_is_gpu_exec_space(); + KokkosKernels::Impl::kk_is_gpu_exec_space(); const bool A_is_lr = std::is_same::value; const bool A_is_tr = ((transA[0] == 'T') || (transA[0] == 't') || @@ -145,8 +146,8 @@ struct GEMM { // call 
dot-based GEMM, only for C := beta * C + alpha * A^T * B, on // device bool A_is_conj = ((transA[0] == 'C') || (transA[0] == 'c')); - DotBasedGEMM dotBasedGemm( - alpha, A, B, beta, C); + DotBasedGEMM + dotBasedGemm(alpha, A, B, beta, C); dotBasedGemm.run(space, A_is_conj); } else { @@ -168,15 +169,15 @@ struct GEMM { 24000) ? 4 : 16; - int vector_length = blockB1 / 4; - int max_vector_length = KokkosKernels::Impl::kk_get_max_vector_size< - typename CViewType::execution_space>(); + int vector_length = blockB1 / 4; + int max_vector_length = + KokkosKernels::Impl::kk_get_max_vector_size(); if (vector_length > max_vector_length) vector_length = max_vector_length; // Compute scratch space size - typedef KokkosBlas::Impl::GEMMImpl + typedef KokkosBlas::Impl::GEMMImpl gemm_dummy_type; const int scratch_memory_size = gemm_dummy_type::ViewTypeAScratch::required_allocation_size() + @@ -187,96 +188,83 @@ struct GEMM { // Figure out Team Sizes int team_size = 1; #if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_ROCM) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_SYCL) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif // Call the correct kernel if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'N' || transB[0] == 
'n')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } @@ -303,6 +291,7 @@ struct GEMM { #define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ LAYOUTC, EXEC_SPACE, MEM_SPACE) \ extern template struct GEMM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -316,6 +305,7 @@ struct GEMM { #define KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ LAYOUTC, EXEC_SPACE, MEM_SPACE) \ template struct GEMM< \ + EXEC_SPACE, \ Kokkos::View, \ 
Kokkos::MemoryTraits >, \ @@ -355,6 +345,5 @@ struct GEMM { EXEC_SPACE, MEM_SPACE) #include -#include #endif // KOKKOSBLAS3_GEMM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas3_trmm_spec.hpp b/blas/impl/KokkosBlas3_trmm_spec.hpp index 50d74b659f..85a8b1c6db 100644 --- a/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -26,7 +26,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct trmm_eti_spec_avail { enum : bool { value = false }; }; @@ -40,6 +40,7 @@ struct trmm_eti_spec_avail { EXEC_SPACE, MEM_SPACE) \ template <> \ struct trmm_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -67,21 +68,26 @@ namespace Impl { // // Unification layer -template ::value, - bool eti_spec_avail = trmm_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + trmm_eti_spec_avail::value> struct TRMM { - static void trmm(const char side[], const char uplo[], const char trans[], - const char diag[], typename BVIT::const_value_type& alpha, - const AVIT& A, const BVIT& B); + static void trmm(const execution_space& space, const char side[], + const char uplo[], const char trans[], const char diag[], + typename BVIT::const_value_type& alpha, const AVIT& A, + const BVIT& B); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct TRMM { - static void trmm(const char side[], const char uplo[], const char trans[], - const char diag[], typename BVIT::const_value_type& alpha, - const AVIT& A, const BVIT& B) { +template +struct TRMM { + static void trmm(const execution_space& /*space*/, const char side[], + const char uplo[], const char trans[], const char diag[], + typename BVIT::const_value_type& alpha, const AVIT& A, + const BVIT& B) { static_assert(Kokkos::is_view::value, "AVIT must be a Kokkos::View."); static_assert(Kokkos::is_view::value, "BVIT must be a Kokkos::View."); 
static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); @@ -121,6 +127,7 @@ struct TRMM { #define KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ EXEC_SPACE, MEM_SPACE) \ extern template struct TRMM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -131,6 +138,7 @@ struct TRMM { #define KOKKOSBLAS3_TRMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ EXEC_SPACE, MEM_SPACE) \ template struct TRMM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -154,6 +162,5 @@ struct TRMM { MEM_SPACE) #include -#include #endif // KOKKOSBLAS3_TRMM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas3_trsm_impl.hpp b/blas/impl/KokkosBlas3_trsm_impl.hpp index 9700b62e67..87cac8b86a 100644 --- a/blas/impl/KokkosBlas3_trsm_impl.hpp +++ b/blas/impl/KokkosBlas3_trsm_impl.hpp @@ -40,7 +40,7 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, const int as0, const int as1, /**/ ValueType* KOKKOS_RESTRICT B, const int bs0, const int bs1) { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; const ScalarType one(1.0), zero(0.0); @@ -79,7 +79,7 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, const int as0, const int as1, /**/ ValueType* KOKKOS_RESTRICT B, const int bs0, const int bs1) { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; const ScalarType one(1.0), zero(0.0); diff --git a/blas/impl/KokkosBlas3_trsm_spec.hpp b/blas/impl/KokkosBlas3_trsm_spec.hpp index d05dad2275..93d01ed53b 100644 --- a/blas/impl/KokkosBlas3_trsm_spec.hpp +++ b/blas/impl/KokkosBlas3_trsm_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct trsm_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct trsm_eti_spec_avail { EXEC_SPACE, MEM_SPACE) \ template <> \ struct trsm_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ 
Kokkos::MemoryTraits >, \ @@ -70,23 +71,25 @@ namespace Impl { // // Unification layer -template < - class AViewType, class BViewType, - bool tpl_spec_avail = trsm_tpl_spec_avail::value, - bool eti_spec_avail = trsm_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + trsm_eti_spec_avail::value> struct TRSM { - static void trsm(const char side[], const char uplo[], const char trans[], - const char diag[], + static void trsm(const execution_space& space, const char side[], + const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B); }; // Implementation of KokkosBlas::trsm. #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct TRSM { - static void trsm(const char side[], const char uplo[], const char trans[], - const char diag[], +template +struct TRSM { + static void trsm(const execution_space& /*space*/, const char side[], + const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { static_assert(Kokkos::is_view::value, @@ -134,6 +137,7 @@ struct TRSM { #define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ EXEC_SPACE, MEM_SPACE) \ extern template struct TRSM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -144,6 +148,7 @@ struct TRSM { #define KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ EXEC_SPACE, MEM_SPACE) \ template struct TRSM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -160,6 +165,5 @@ struct TRSM { MEM_SPACE) #include -#include #endif // KOKKOSBLAS3_TRSM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas_gesv_spec.hpp b/blas/impl/KokkosBlas_gesv_spec.hpp index 8b554f7130..f1dff467c8 100644 --- a/blas/impl/KokkosBlas_gesv_spec.hpp +++ b/blas/impl/KokkosBlas_gesv_spec.hpp @@ -128,6 +128,5 @@ struct GESV { false, true>; #include -#include #endif // 
KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas_trtri_spec.hpp b/blas/impl/KokkosBlas_trtri_spec.hpp index 7b3e6b1fc3..2a4d2db576 100644 --- a/blas/impl/KokkosBlas_trtri_spec.hpp +++ b/blas/impl/KokkosBlas_trtri_spec.hpp @@ -123,6 +123,5 @@ struct TRTRI { false, true>; #include -#include #endif // KOKKOSBLAS_TRTRI_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 0a96d05488..50173538fb 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -41,63 +41,6 @@ struct Trans { struct ConjTranspose {}; }; -#if !defined(KOKKOS_IF_ON_HOST) - -namespace Impl { - -template -struct algo_level3_blocked_mb_impl; -template <> -struct algo_level3_blocked_mb_impl { - static constexpr int value = 4; -}; -#if defined(KOKKOS_ENABLE_CUDA) -template <> -struct algo_level3_blocked_mb_impl { - static constexpr int value = 2; -}; -#endif -#if defined(KOKKOS_ENABLE_HIP) -template <> -struct algo_level3_blocked_mb_impl { - static constexpr int value = 2; -}; -#endif -#if defined(KOKKOS_ENABLE_SYCL) -template <> -struct algo_level3_blocked_mb_impl { - static constexpr int value = 2; -}; -#endif - -template -struct algo_level2_blocked_mb_impl; -template <> -struct algo_level2_blocked_mb_impl { - static constexpr int value = 4; -}; -#if defined(KOKKOS_ENABLE_CUDA) -template <> -struct algo_level2_blocked_mb_impl { - static constexpr int value = 1; -}; -#endif -#if defined(KOKKOS_ENABLE_HIP) -template <> -struct algo_level2_blocked_mb_impl { - static constexpr int value = 1; -}; -#endif -#if defined(KOKKOS_ENABLE_SYCL) -template <> -struct algo_level2_blocked_mb_impl { - static constexpr int value = 1; -}; -#endif - -} // namespace Impl -#endif - struct Algo { struct Level3 { struct Unblocked { @@ -111,19 +54,10 @@ struct Algo { // - team policy (smaller) or range policy (bigger) // - space (gpu vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. 
-#if defined(KOKKOS_IF_ON_HOST) static constexpr KOKKOS_FUNCTION int mb() { KOKKOS_IF_ON_HOST((return 4;)) KOKKOS_IF_ON_DEVICE((return 2;)) } - -#else // FIXME remove when requiring minimum version of Kokkos 3.6 - static constexpr KOKKOS_FUNCTION int mb() { - return algo_level3_blocked_mb_impl< - Kokkos::Impl::ActiveExecutionMemorySpace>::value; - } - -#endif }; struct MKL { static const char *name() { return "MKL"; } @@ -161,19 +95,10 @@ struct Algo { // - team policy (smaller) or range policy (bigger) // - space (cuda vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. -#if defined(KOKKOS_IF_ON_HOST) static constexpr KOKKOS_FUNCTION int mb() { KOKKOS_IF_ON_HOST((return 4;)) KOKKOS_IF_ON_DEVICE((return 1;)) } - -#else // FIXME remove when requiring minimum version of Kokkos 3.6 - static constexpr KOKKOS_FUNCTION int mb() { - return algo_level2_blocked_mb_impl< - Kokkos::Impl::ActiveExecutionMemorySpace>::value; - } - -#endif }; struct MKL {}; struct CompactMKL {}; diff --git a/blas/src/KokkosBlas1_abs.hpp b/blas/src/KokkosBlas1_abs.hpp index 969a0a3c40..bd63ccedf1 100644 --- a/blas/src/KokkosBlas1_abs.hpp +++ b/blas/src/KokkosBlas1_abs.hpp @@ -25,21 +25,38 @@ namespace KokkosBlas { /// \brief R(i,j) = abs(X(i,j)) /// -/// Replace each entry in R with the absolute value (magnitude) of the -/// corresponding entry in X. +/// Non-blocking function to replace each entry in R with the absolute value +/// (magnitude) of the corresponding entry in X. /// +/// \tparam execution_space a Kokkos execution space to run the kernels on. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void abs(const RMV& R, const XMV& X) { +/// +/// \param space [in] an execution_space instance where the kernel will run. +/// \param R [out] view of type RMV that contains the absolute value of X on +/// output. 
+/// \param X [in] view of type XMV. +template +void abs(const execution_space& space, const RMV& R, const XMV& X) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::abs: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "R is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: RMV must be accessible from execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: XMV must be accessible from execution space"); static_assert(std::is_same::value, "KokkosBlas::abs: R is const. " @@ -63,24 +80,42 @@ void abs(const RMV& R, const XMV& X) { // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< + using RMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RMV::device_type, Kokkos::MemoryTraits > - RMV_Internal; - typedef Kokkos::View< + typename RMV::device_type, Kokkos::MemoryTraits >; + using XMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > - XMV_Internal; + typename XMV::device_type, Kokkos::MemoryTraits >; RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Abs::abs(R_internal, X_internal); + Impl::Abs::abs(space, R_internal, + X_internal); +} + +/// \brief R(i,j) = abs(X(i,j)) +/// +/// Non-blocking function to replace each entry in R with the absolute value +/// (magnitude) of the corresponding entry in X. The kernel is executed in the +/// default stream/queue associated with the execution space of RMV. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. 
It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +/// +/// \param R [out] view of type RMV that contains the absolute value of X on +/// output. +/// \param X [in] view of type XMV. +template +void abs(const RMV& R, const XMV& X) { + abs(typename RMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index e2ec1dde0c..2f59cb4cce 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -30,23 +30,51 @@ namespace KokkosBlas { -template -void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { +/// \brief Computes Y := a*X + b*Y +/// +/// This function is non-blocking and thread-safe. +/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. +/// \tparam AV scalar or 1-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam BV scalar or 1-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// +/// \param space [in] the execution space instance on which the kernel will run. +/// \param a [in] scalar or view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +/// \param b [in] scalar or view of type BV, scaling parameter for Y. +/// \param Y [in/out] view of type YMV in which the results will be stored. 
+template +void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, + const YMV& Y) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::axpby: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::axpby: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::axpby: XMV must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::axpby: " "Y is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::axpby: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::axpby: Y is const. It must be nonconst, " "because it is an output argument " "(we must be able to write to its entries)."); - static_assert(int(YMV::Rank) == int(XMV::Rank), + static_assert(int(YMV::rank) == int(XMV::rank), "KokkosBlas::axpby: " "X and Y must have the same rank."); - static_assert(YMV::Rank == 1 || YMV::Rank == 2, + static_assert(YMV::rank == 1 || YMV::rank == 2, "KokkosBlas::axpby: " "XMV and YMV must either have rank 1 or rank 2."); @@ -68,33 +96,88 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { // Create unmanaged versions of the input Views. XMV and YMV may be // rank 1 or rank 2. AV and BV may be either rank-1 Views, or // scalar values. 
- typedef Kokkos::View > - XMV_Internal; - typedef Kokkos::View > - YMV_Internal; - typedef typename KokkosKernels::Impl::GetUnifiedScalarViewType< - AV, XMV_Internal, true>::type AV_Internal; - typedef typename KokkosKernels::Impl::GetUnifiedScalarViewType< - BV, YMV_Internal, true>::type BV_Internal; + using XMV_Internal = Kokkos::View >; + using YMV_Internal = Kokkos::View >; + using AV_Internal = + typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; + using BV_Internal = + typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; AV_Internal a_internal = a; XMV_Internal X_internal = X; BV_Internal b_internal = b; YMV_Internal Y_internal = Y; - Impl::Axpby::axpby( - a_internal, X_internal, b_internal, Y_internal); + Impl::Axpby::axpby(space, a_internal, X_internal, b_internal, + Y_internal); +} + +/// \brief Computes Y := a*X + b*Y +/// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of XMV. +/// +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam BV 1-D or 2-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +/// \param b [in] view of type BV, scaling parameter for Y. +/// \param Y [in/out] view of type YMV in which the results will be stored. +template +void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { + axpby(typename XMV::execution_space{}, a, X, b, Y); +} + +/// \brief Computes Y := a*X + Y +/// +/// This function is non-blocking and thread-safe +/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. 
+/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// +/// \param space [in] the execution space instance on which the kernel will run. +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +/// \param Y [in/out] view of type YMV in which the results will be stored. +template +void axpy(const execution_space& space, const AV& a, const XMV& X, + const YMV& Y) { + axpby(space, a, X, + Kokkos::ArithTraits::one(), Y); } +/// \brief Computes Y := a*X + Y +/// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of XMV. +/// +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +/// \param Y [in/out] view of type YMV in which the results will be stored. 
template void axpy(const AV& a, const XMV& X, const YMV& Y) { - axpby(a, X, - Kokkos::Details::ArithTraits::one(), - Y); + axpy(typename XMV::execution_space{}, a, X, Y); } /// @@ -107,10 +190,10 @@ KOKKOS_FUNCTION void serial_axpy(const scalar_type alpha, const XMV X, YMV Y) { "KokkosBlas::serial_axpy: XMV is not a Kokkos::View"); static_assert(Kokkos::is_view::value, "KokkosBlas::serial_axpy: YMV is not a Kokkos::View"); - static_assert(XMV::Rank == 1 || XMV::Rank == 2, + static_assert(XMV::rank == 1 || XMV::rank == 2, "KokkosBlas::serial_axpy: XMV must have rank 1 or 2."); static_assert( - XMV::Rank == YMV::Rank, + XMV::rank == YMV::rank, "KokkosBlas::serial_axpy: XMV and YMV must have the same rank."); if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index 4a5a18b976..ebccce7d7c 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -25,21 +25,38 @@ namespace KokkosBlas { /// \brief Return the dot product of the two vectors x and y. /// +/// \tparam execution_space the Kokkos execution space where the kernel +/// will be executed. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// \tparam YVector Type of the second vector y; a 1-D Kokkos::View. /// +/// \param space [in] an execution space instance that may specify +/// in which stream/queue the kernel will be executed. /// \param x [in] Input 1-D View. /// \param y [in] Input 1-D View. /// /// \return The dot product result; a single value. 
-template +template , + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::dot_type -dot(const XVector& x, const YVector& y) { +dot(const execution_space& space, const XVector& x, const YVector& y) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::dot: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XVector must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: YVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: YVector must be accessible from execution_space"); static_assert((int)XVector::rank == (int)YVector::rank, "KokkosBlas::dot: Vector ranks do not match."); static_assert(XVector::rank == 1, @@ -55,16 +72,14 @@ dot(const XVector& x, const YVector& y) { KokkosKernels::Impl::throw_runtime_exception(os.str()); } - typedef Kokkos::View< + using XVector_Internal = Kokkos::View< typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits> - XVector_Internal; - typedef Kokkos::View< + typename XVector::device_type, Kokkos::MemoryTraits>; + using YVector_Internal = Kokkos::View< typename YVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits> - YVector_Internal; + typename YVector::device_type, Kokkos::MemoryTraits>; using dot_type = typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::dot_type; @@ -91,9 +106,10 @@ dot(const XVector& x, const YVector& y) { // 32-bit precision). 
Impl::Dot needs to support both cases, and it's easier // to do this with overloading than by extending the ETI to deal with two // different scalar types. - Impl::DotSpecialAccumulator::dot(R, X, Y); - Kokkos::fence(); + Impl::DotSpecialAccumulator::dot(space, R, + X, Y); + space.fence(); // mfh 22 Jan 2020: We need the line below because // Kokkos::complex lacks a constructor that takes a // Kokkos::complex with U != T. @@ -101,12 +117,37 @@ dot(const XVector& x, const YVector& y) { result); } +/// \brief Return the dot product of the two vectors x and y. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// \tparam YVector Type of the second vector y; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// \param y [in] Input 1-D View. +/// +/// \return The dot product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::dot_type +dot(const XVector& x, const YVector& y) { + return dot(typename XVector::execution_space{}, x, y); +} + /// \brief Compute the column-wise dot products of two multivectors. /// +/// This function is non-blocking and thread-safe. +/// +/// \tparam execution_space the Kokkos execution space where the kernel +/// will be executed. /// \tparam RV 0-D resp. 1-D output View /// \tparam XMV 1-D resp. 2-D input View /// \tparam YMV 1-D resp. 2-D input View /// +/// \param space [in] an execution space instance that may specify +/// in which stream/queue the kernel will be executed. /// \param R [out] Output 1-D or 0-D View to which to write results. /// \param X [in] Input 2-D or 1-D View. /// \param Y [in] Input 2-D or 1-D View. 
@@ -127,18 +168,29 @@ dot(const XVector& x, const YVector& y) { /// \note To implementers: We use enable_if here so that the compiler /// doesn't confuse this version of dot() with the three-argument /// version of dot() in Kokkos_Blas1.hpp. -template -void dot(const RV& R, const XMV& X, const YMV& Y, +template +void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::dot: excution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "Y is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::dot: R is const. " @@ -215,8 +267,44 @@ void dot(const RV& R, const XMV& X, const YMV& Y, XMV_Internal X_internal = X; YMV_Internal Y_internal = Y; - Impl::Dot::dot( - R_internal, X_internal, Y_internal); + Impl::Dot::dot( + space, R_internal, X_internal, Y_internal); +} + +/// \brief Compute the column-wise dot products of two multivectors. +/// +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVM. +/// +/// \tparam RV 0-D resp. 1-D output View +/// \tparam XMV 1-D resp. 2-D input View +/// \tparam YMV 1-D resp. 2-D input View +/// +/// \param R [out] Output 1-D or 0-D View to which to write results. +/// \param X [in] Input 2-D or 1-D View. +/// \param Y [in] Input 2-D or 1-D View. +/// +/// This function implements a few different use cases: +///
<ul>
+/// <li> If X and Y are both 1-D, then this is a single dot product. +/// R must be 0-D (a View of a single value). </li>
+/// <li> If X and Y are both 2-D, then this function computes their +/// dot products columnwise. R must be 1-D. </li>
+/// <li> If X is 2-D and Y is 1-D, then this function computes the dot +/// product of each column of X, with Y, in turn. R must be +/// 1-D. </li>
+/// <li> If X is 1-D and Y is 2-D, then this function computes the dot +/// product X with each column of Y, in turn. R must be 1-D. </li>
+/// </ul>
+/// +/// \note To implementers: We use enable_if here so that the compiler +/// doesn't confuse this version of dot() with the three-argument +/// version of dot() in Kokkos_Blas1.hpp. +template +void dot(const RV& R, const XMV& X, const YMV& Y, + typename std::enable_if::value, int>::type = 0) { + dot(typename XMV::execution_space{}, R, X, Y); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_fill.hpp b/blas/src/KokkosBlas1_fill.hpp index 37aebb3c5d..403411f7b8 100644 --- a/blas/src/KokkosBlas1_fill.hpp +++ b/blas/src/KokkosBlas1_fill.hpp @@ -23,6 +23,29 @@ namespace KokkosBlas { /// \brief Fill the multivector or single vector X with the given value. /// +/// This function is non-blocking and thread-safe +/// +/// \tparam execution_space a Kokkos execution space +/// \tparam XMV 1-D or 2-D output View +/// +/// \param space [in] A Kokkos instance of execution_space on which the +/// kernel will run. +/// \param X [out] Output View (1-D or 2-D). +/// \param val [in] Value with which to fill the entries of X. +template +void fill(const execution_space& space, const XMV& X, + const typename XMV::non_const_value_type& val) { + Kokkos::Profiling::pushRegion("KokkosBlas::fill"); + Kokkos::deep_copy(space, X, val); + Kokkos::Profiling::popRegion(); +} + +/// \brief Fill the multivector or single vector X with the given value. +/// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of XMV. +/// /// \tparam XMV 1-D or 2-D output View /// /// \param X [out] Output View (1-D or 2-D). diff --git a/blas/src/KokkosBlas1_iamax.hpp b/blas/src/KokkosBlas1_iamax.hpp index 22411a70bb..cfaaaeed63 100644 --- a/blas/src/KokkosBlas1_iamax.hpp +++ b/blas/src/KokkosBlas1_iamax.hpp @@ -26,17 +26,29 @@ namespace KokkosBlas { /// \brief Return the (smallest) index of the element of the maximum magnitude /// of the vector x. 
/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] execution space instance where the kernel will run. /// \param x [in] Input 1-D View. /// /// \return The (smallest) index of the element of the maximum magnitude; a /// single value. /// Note: Returned index is 1-based for compatibility with Fortran. -template -typename XVector::size_type iamax(const XVector& x) { +template , + int>::type = 0> +typename XVector::size_type iamax(const execution_space& space, + const XVector& x) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::iamax: execution_space must be a valid Kokkos " + "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::iamax: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::iamax: " "Both Vector inputs must have rank 1."); @@ -49,39 +61,71 @@ typename XVector::size_type iamax(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; index_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Iamax::iamax(R, X); - Kokkos::fence(); + Impl::Iamax::iamax(space, + R, X); + space.fence(); return result; } +/// \brief Return the (smallest) index of the element of the maximum magnitude +/// of the vector x. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The (smallest) index of the element of the maximum magnitude; a +/// single value. 
+/// Note: Returned index is 1-based for compatibility with Fortran. +template +typename XVector::size_type iamax(const XVector& x) { + return iamax(typename XVector::execution_space{}, x); +} + /// \brief R(j) = iamax(X(i,j)) /// /// Replace each entry in R with the (smallest) index of the element of the /// maximum magnitude of the corresponding entry in X. +/// This function is non-blocking and thread-safe. /// /// \tparam RMV 0-D or 1-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. /// +/// \param space [in] execution space instance where the kernel will run. +/// \param R [out] Output View (rank 0 or 1) containing the results. +/// \param X [in] Input View (rank 1 or 2). +/// /// Note for TPL cuBLAS: When TPL cuBLAS iamax is used and returns result to a /// view, RMV must be 0-D view and XMV must be 1-D view. -template -void iamax(const RV& R, const XMV& X, +template +void iamax(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::iamax: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::iamax: XMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::iamax: R is const. " @@ -135,7 +179,27 @@ void iamax(const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Iamax::iamax(R_internal, X_internal); + Impl::Iamax::iamax( + space, R_internal, X_internal); +} + +/// \brief R(j) = iamax(X(i,j)) +/// +/// Replace each entry in R with the (smallest) index of the element of the +/// maximum magnitude of the corresponding entry in X. +/// This function is non-blocking and thread-safe. 
+/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. +/// +/// \tparam RMV 0-D or 1-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// +/// Note for TPL cuBLAS: When TPL cuBLAS iamax is used and returns result to a +/// view, RMV must be 0-D view and XMV must be 1-D view. +template +void iamax(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + iamax(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index e08409e9aa..47fa1f536f 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -26,10 +26,15 @@ namespace KokkosBlas { /// \brief Element wise multiplication of two vectors: /// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] /// +/// This function is non-blocking and thread-safe +/// +/// \tparam execution_space a Kokkos execution space type. /// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. /// \tparam AV Type of the second vector A; a 1-D Kokkos::View. /// \tparam XMV Type of the third vector X; a 1-D or 2-D Kokkos::View. /// +/// \param space [in] An instance of execution_space on which the kernel +/// will run (it may specify an execution stream/queue). /// \param gamma [in] The scalar to apply to Y. /// \param Y [in/out] The Y vector. /// \param alpha [in] The scalar to apply to A. @@ -37,15 +42,34 @@ namespace KokkosBlas { /// \param X [in] The X vector. /// /// \return Y = gamma * Y + alpha * A * X.
-template -void mult(typename YMV::const_value_type& gamma, const YMV& Y, - typename AV::const_value_type& alpha, const AV& A, const XMV& X) { +template +void mult(const execution_space& space, typename YMV::const_value_type& gamma, + const YMV& Y, typename AV::const_value_type& alpha, const AV& A, + const XMV& X) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::mult: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "Y is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: YMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "A is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); + static_assert(Kokkos::is_view::value, + "KokkosBlas::mult: " + "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::mult: Y is const. " @@ -95,8 +119,32 @@ void mult(typename YMV::const_value_type& gamma, const YMV& Y, AV_Internal A_internal = A; XMV_Internal X_internal = X; - Impl::Mult::mult( - gamma, Y_internal, alpha, A_internal, X_internal); + Impl::Mult::mult( + space, gamma, Y_internal, alpha, A_internal, X_internal); +} + +/// \brief Element wise multiplication of two vectors: +/// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] +/// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of YMV. +/// +/// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. +/// \tparam AV Type of the second vector A; a 1-D Kokkos::View. +/// \tparam XMV Type of the third vector X; a 1-D or 2-D Kokkos::View. +/// +/// \param gamma [in] The scalar to apply to Y. 
+/// \param Y [in/out] The Y vector. +/// \param alpha [in] The scalar to apply to A. +/// \param A [in] The vector to apply to X. +/// \param X [in] The X vector. +/// +/// \return Y = gamma * Y + alpha * A * X. +template +void mult(typename YMV::const_value_type& gamma, const YMV& Y, + typename AV::const_value_type& alpha, const AV& A, const XMV& X) { + mult(typename YMV::execution_space{}, gamma, Y, alpha, A, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index 62f373d7b8..e9b26e6177 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -25,15 +25,24 @@ namespace KokkosBlas { /// \brief Return the nrm1 of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. /// \param x [in] Input 1-D View. /// /// \return The nrm1 product result; a single value. -template +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm1(const XVector& x) { +nrm1(const execution_space& space, const XVector& x) { + static_assert( + Kokkos::is_execution_space::value, + "KokkosBlas::nrm1: execution_space must be a Kokkos::execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, @@ -55,22 +64,44 @@ nrm1(const XVector& x) { RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm1::nrm1(R, X); - Kokkos::fence(); + Impl::Nrm1::nrm1(space, + R, X); + space.fence(); return result; } +/// \brief Return the nrm1 of the vector x. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. 
+/// +/// \param x [in] Input 1-D View. +/// +/// \return The nrm1 product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm1(const XVector& x) { + return nrm1(typename XVector::execution_space{}, x); +} + /// \brief R(j) = nrm1(X(i,j)) /// /// Replace each entry in R with the nrm1olute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void nrm1(const RV& R, const XMV& X, +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output 1-D View containing the result +/// \param X [in] Input 1-D View. 
+template +void nrm1(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: " @@ -87,6 +118,10 @@ void nrm1(const RV& R, const XMV& X, ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm1: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm1: execution_space cannot access data in XMV"); typedef typename Kokkos::Details::InnerProductSpaceTraits< typename XMV::non_const_value_type>::mag_type mag_type; @@ -128,7 +163,28 @@ void nrm1(const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm1::nrm1(R_internal, X_internal); + Impl::Nrm1::nrm1( + space, R_internal, X_internal); +} + +/// \brief R(j) = nrm1(X(i,j)) +/// +/// Replace each entry in R with the nrm1olute value (magnitude) of the +/// corresponding entry in X. +/// This function is non-blocking and thread-safe. The kernel is executed in the +/// default stream/queue associated with the execution space of XMV. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +/// +/// \param R [out] Output 1-D View containing the result +/// \param X [in] Input 1-D View. +template +void nrm1(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + nrm1(typename XMV::execution_space{}, R, X); } /// \brief Return the nrm1 of the vector x via asum (the actual blas name). diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index d3e2f03138..67cdde17fa 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -26,17 +26,30 @@ namespace KokkosBlas { /// \brief Return the nrm2 of the vector x. 
/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. -template +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2(const XVector& x) { +nrm2(const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2: " "XVector must have rank 1."); @@ -49,38 +62,70 @@ nrm2(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; mag_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Nrm2::nrm2(R, X, true); - Kokkos::fence(); + Impl::Nrm2::nrm2( + space, R, X, true); + space.fence(); return result; } +/// \brief Return the nrm2 of the vector x. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The nrm2 product result; a single value. 
+template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm2(const XVector& x) { + return nrm2(typename XVector::execution_space{}, x); +} + /// \brief R(i,j) = nrm2(X(i,j)) /// /// Replace each entry in R with the nrm2olute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void nrm2(const RV& R, const XMV& X, +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +template +void nrm2(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2: space is not a Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: X cannot be accessed from execution_space."); static_assert(std::is_same::value, "KokkosBlas::nrm2: R is const. " @@ -127,7 +172,30 @@ void nrm2(const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2(R_internal, X_internal, true); + Impl::Nrm2::nrm2( + space, R_internal, X_internal, true); +} + +/// \brief R(i,j) = nrm2(X(i,j)) +/// +/// Replace each entry in R with the nrm2olute value (magnitude) of the +/// corresponding entry in X. 
+/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +/// +/// where the kernel will be executed. +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +template +void nrm2(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + nrm2(typename XMV::execution_space{}, R, X); } /// @@ -140,7 +208,7 @@ serial_nrm2(const XMV X) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_view::value, "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); - static_assert(XMV::Rank == 1, + static_assert(XMV::rank == 1, "KokkosBlas::serial_nrm2: XMV must have rank 1"); #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/blas/src/KokkosBlas1_nrm2_squared.hpp b/blas/src/KokkosBlas1_nrm2_squared.hpp index 3a584c8a99..c065efb290 100644 --- a/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -25,17 +25,31 @@ namespace KokkosBlas { /// \brief Return the nrm2 of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. 
-template +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2_squared(const XVector& x) { +nrm2_squared(const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2_squared: execution_space must be a valid" + " Kokkos execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2_squared: " "Both Vector inputs must have rank 1."); @@ -57,30 +71,63 @@ nrm2_squared(const XVector& x) { RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm2::nrm2(R, X, false); - Kokkos::fence(); + Impl::Nrm2::nrm2( + space, R, X, false); + space.fence(); return result; } +/// \brief Return the nrm2 of the vector x. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The nrm2 product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm2_squared(const XVector& x) { + return nrm2_squared(typename XVector::execution_space{}, x); +} + /// \brief R(i,j) = nrm2(X(i,j)) /// /// Replace each entry in R with the nrm2olute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization.
It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output View (rank 0 or 1) that holds the result. +/// \param X [in] Input View (rank 1 or 2). +template void nrm2_squared( - const RV& R, const XMV& X, + const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2_squared: execution_space must be a valid" + " Kokkos execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrm2_squared: R is const. " @@ -126,7 +173,27 @@ void nrm2_squared( RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2(R_internal, X_internal, false); + Impl::Nrm2::nrm2( + space, R_internal, X_internal, false); +} + +/// \brief R(i,j) = nrm2(X(i,j)) +/// +/// Replace each entry in R with the nrm2olute value (magnitude) of the +/// corresponding entry in X. +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV.
+template +void nrm2_squared( + const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + nrm2_squared(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index 403d8ba685..c5eaa0621b 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -25,62 +25,111 @@ namespace KokkosBlas { /// \brief Return the nrm2w of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the computation +/// will be launched /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] an execution space instance that may specify +/// a stream or queue for the compute kernel execution. /// \param x [in] Input 1-D View. +/// \param w [in] /// /// \return The nrm2w product result; a single value. -template +template typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2w(const XVector& x, const XVector& w) { +nrm2w(const execution_space& space, const XVector& x, const XVector& w, + typename std::enable_if< + Kokkos::is_execution_space::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2w: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type; - typedef Kokkos::View< + using XVector_Internal = Kokkos::View< typename 
XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > - XVector_Internal; + typename XVector::device_type, Kokkos::MemoryTraits >; - typedef Kokkos::View > - RVector_Internal; + using layout_t = typename XVector_Internal::array_layout; + + using RVector_Internal = + Kokkos::View >; mag_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w(R, X, W, true); - Kokkos::fence(); + Impl::Nrm2w::nrm2w( + space, R, X, W, true); + space.fence(); return result; } +/// \brief Return the nrm2w of the vector x. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// \param w [in] +/// +/// \return The nrm2w product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm2w(const XVector& x, const XVector& w) { + return nrm2w(typename XVector::execution_space{}, x, w); +} + /// \brief R(i,j) = nrm2w(X(i,j)) /// /// Replace each entry in R with the nrm2w, absolute value (magnitude), of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void nrm2w(const RV& R, const XMV& X, const XMV& W, +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. 
+/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +/// \param W [in] Input View (rank 1 or 2). +template +void nrm2w(const execution_space& space, const RV& R, const XMV& X, + const XMV& W, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2w: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrm2w: R is const. " @@ -128,9 +177,33 @@ void nrm2w(const RV& R, const XMV& X, const XMV& W, XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w(R_internal, X_internal, - W_internal, true); + Impl::Nrm2w::nrm2w( + space, R_internal, X_internal, W_internal, true); } + +/// \brief R(i,j) = nrm2w(X(i,j)) +/// +/// Replace each entry in R with the nrm2w, absolute value (magnitude), of the +/// corresponding entry in X. +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. +/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +/// +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +/// \param W [in] Input View (rank 1 or 2).
+template +void nrm2w(const RV& R, const XMV& X, const XMV& W, + typename std::enable_if::value, int>::type = 0) { + nrm2w(typename XMV::execution_space{}, R, X, W); +} + } // namespace KokkosBlas #endif // KOKKOSBLAS1_NRM2W_HPP_ diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index d39dfb0432..a1fe10bf1e 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -25,63 +25,115 @@ namespace KokkosBlas { /// \brief Return the nrm2w of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the computation +/// will be launched /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] an execution space instance (may carry a stream/queue) /// \param x [in] Input 1-D View. +/// \param w [in] Input weights (1-D View). /// /// \return The nrm2w product result; a single value. -template +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2w_squared(const XVector& x, const XVector& w) { +nrm2w_squared(const execution_space& space, const XVector& x, + const XVector& w) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2w_squared: execution_space must be a valid " + "Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from " + "execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w_squared: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename
XVector::non_const_value_type>::mag_type; - typedef Kokkos::View< + using XVector_Internal = Kokkos::View< typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > - XVector_Internal; + typename XVector::device_type, Kokkos::MemoryTraits >; + + using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > - RVector_Internal; + using RVector_Internal = + Kokkos::View >; mag_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w(R, X, W, false); - Kokkos::fence(); + Impl::Nrm2w::nrm2w( + space, R, X, W, false); + space.fence(); return result; } +/// \brief Return the nrm2w of the vector x. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// \param w [in] Input weights (1-D View). +/// +/// \return The nrm2w product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm2w_squared(const XVector& x, const XVector& w) { + return nrm2w_squared(typename XVector::execution_space(), x, w); +} + /// \brief R(i,j) = nrm2w(X(i,j)) /// /// Replace each entry in R with the nrm2wolute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. 
-template +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +/// \param W [in] Input View (rank 1 or 2). +template void nrm2w_squared( - const RV& R, const XMV& X, const XMV& W, + const execution_space& space, const RV& R, const XMV& X, const XMV& W, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2w_squared: execution_space must be a valid " + "Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from " + "execution_space."); static_assert(std::is_same::value, "KokkosBlas::nrm2w_squared: R is const. " @@ -91,8 +143,8 @@ void nrm2w_squared( ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2w_squared: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename XMV::non_const_value_type>::mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2w: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -115,21 +167,42 @@ void nrm2w_squared( // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
- typedef Kokkos::View > - RV_Internal; - typedef Kokkos::View > - XMV_Internal; + using RV_Internal = Kokkos::View >; + using XMV_Internal = Kokkos::View >; RV_Internal R_internal = R; XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w(R_internal, X_internal, - W_internal, false); + Impl::Nrm2w::nrm2w( + space, R_internal, X_internal, W_internal, false); +} + +/// \brief R(i,j) = nrm2w(X(i,j)) +/// +/// Replace each entry in R with the nrm2wolute value (magnitude) of the +/// corresponding entry in X. +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +/// +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +/// \param W [in] Input View (rank 1 or 2). +template +void nrm2w_squared( + const RV& R, const XMV& X, const XMV& W, + typename std::enable_if::value, int>::type = 0) { + nrm2w_squared(typename XMV::execution_space{}, R, X, W); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_nrminf.hpp b/blas/src/KokkosBlas1_nrminf.hpp index bd4bf080a9..c6f923aefe 100644 --- a/blas/src/KokkosBlas1_nrminf.hpp +++ b/blas/src/KokkosBlas1_nrminf.hpp @@ -25,15 +25,21 @@ namespace KokkosBlas { /// \brief Return the nrminf of the vector x. /// +/// \tparam execution_space The execution space in which the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] an execution space instance that can specify computing +/// resources to be used, for instance a stream or queue. /// \param x [in] Input 1-D View. /// /// \return The nrminf product result; a single value. 
-template +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrminf(const XVector& x) { +nrminf(const execution_space& space, const XVector& x) { static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, @@ -48,39 +54,62 @@ nrminf(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; mag_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::NrmInf::nrminf(R, X); - Kokkos::fence(); + Impl::NrmInf::nrminf( + space, R, X); + space.fence(); return result; } +/// \brief Return the nrminf of the vector x. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The nrminf product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrminf(const XVector& x) { + return nrminf(typename XVector::execution_space{}, x); +} + /// \brief R(j) = nrminf(X(i,j)) /// /// Replace each entry in R with the nrminfolute value (magnitude) of the /// corresponding entry in X. /// +/// \tparam execution_space, the execution space in which the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. 
-template +template void nrminf( - const RV& R, const XMV& X, + const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrminf: space is not an execution space instance"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrminf: X is not accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrminf: R is const. " @@ -131,7 +160,24 @@ void nrminf( RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::NrmInf::nrminf(R_internal, X_internal); + Impl::NrmInf::nrminf( + space, R_internal, X_internal); +} + +/// \brief R(j) = nrminf(X(i,j)) +/// +/// Replace each entry in R with the nrminfolute value (magnitude) of the +/// corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +template +void nrminf( + const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + nrminf(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index 19624d11c9..ef73d26828 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -27,19 +27,37 @@ namespace KokkosBlas { /// /// Replace each entry in R with the absolute value (magnitude), of the /// reciprocal of the corresponding entry in X. +/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space /// \tparam RMV 1-D or 2-D Kokkos::View specialization. 
/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void reciprocal(const RMV& R, const XMV& X) { +/// +/// \param space [in] an instance of execution space where the kernel will run +/// \param R [out] a view of type RMV that contains the inverse of the values in +/// X. +/// \param X [in] a view of type XMV that contains the values to invert. +template +void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::reciprocal: execution_space must be a valid " + "Kokkos execition space."); static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "R is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: RMV must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::reciprocal: R is const. " @@ -80,8 +98,25 @@ void reciprocal(const RMV& R, const XMV& X) { RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Reciprocal::reciprocal(R_internal, - X_internal); + Impl::Reciprocal::reciprocal( + space, R_internal, X_internal); +} + +/// \brief R(i,j) = reciprocal(X(i,j)) +/// +/// Replace each entry in R with the absolute value (magnitude), of the +/// reciprocal of the corresponding entry in X. +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of RMV. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. 
+template +void reciprocal(const RMV& R, const XMV& X) { + reciprocal(typename RMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_rotg.hpp b/blas/src/KokkosBlas1_rotg.hpp index 6b26ce57ec..3b66ae0115 100644 --- a/blas/src/KokkosBlas1_rotg.hpp +++ b/blas/src/KokkosBlas1_rotg.hpp @@ -26,10 +26,14 @@ namespace KokkosBlas { /// /// \tparam Scalar data type of inputs and outputs /// +/// \param space [in] the execution space /// \param a [in/out] on input one of the values to rotate, on output the -/// rotated value \param b [in/out] on input one of the values to rotate, on -/// output the rotated value \param c [out] cosine value associated with the -/// rotation \param s [out] sine value associated with the rotation +/// rotated value +/// \param b [in/out] on input one of the values to rotate, on +/// output the rotated value +/// \param c [out] cosine value associated with the +/// rotation +/// \param s [out] sine value associated with the rotation template void rotg(execution_space const& space, SViewType const& a, SViewType const& b, MViewType const& c, SViewType const& s) { diff --git a/blas/src/KokkosBlas1_rotm.hpp b/blas/src/KokkosBlas1_rotm.hpp index 4b6a45210a..077d3350fe 100644 --- a/blas/src/KokkosBlas1_rotm.hpp +++ b/blas/src/KokkosBlas1_rotm.hpp @@ -25,14 +25,14 @@ namespace KokkosBlas { /// \brief Applies modified Givens rotation coefficients to vectors x and y. /// /// \tparam execution_space the execution space where the kernel will be -/// executed, it can be used to specify a stream too. +/// executed, it can be used to specify a stream too. 
/// \tparam VectorView a rank1 view type that hold non const data /// \tparam ParamView a rank1 view of static extent [5] type that -/// holds const data +/// holds const data /// /// \param space [in] execution space used for parallel loops in this kernel -/// \param x1 [in/out] vector to be rotated with param coefficients -/// \param y1 [in/out] vector to be rotated with param coefficients +/// \param X [in/out] vector to be rotated with param coefficients +/// \param Y [in/out] vector to be rotated with param coefficients /// \param param [in] output of rotmg contains rotation coefficients /// template diff --git a/blas/src/KokkosBlas1_rotmg.hpp b/blas/src/KokkosBlas1_rotmg.hpp index 9d1f87cca1..723b0eac1a 100644 --- a/blas/src/KokkosBlas1_rotmg.hpp +++ b/blas/src/KokkosBlas1_rotmg.hpp @@ -25,10 +25,13 @@ namespace KokkosBlas { /// \brief Compute the coefficients to apply a modified Givens rotation. /// /// \tparam execution_space the execution space where the kernel will be -/// executed \tparam DXView a rank0 view type that hold non const data \tparam -/// YView a rank0 view type that holds const data \tparam PView a rank1 view of -/// static extent 5 that holds non const data +/// executed +/// \tparam DXView a rank0 view type that hold non const data +/// \tparam YView a rank0 view type that holds const data +/// \tparam PView a rank1 view of +/// static extent 5 that holds non const data /// +/// \param space [in] execution space used for parallel loops /// \param d1 [in/out] /// \param d2 [in/out] /// \param x1 [in/out] diff --git a/blas/src/KokkosBlas1_scal.hpp b/blas/src/KokkosBlas1_scal.hpp index 8b61936460..39c197f352 100644 --- a/blas/src/KokkosBlas1_scal.hpp +++ b/blas/src/KokkosBlas1_scal.hpp @@ -29,14 +29,44 @@ namespace KokkosBlas { -template -void scal(const RMV& R, const AV& a, const XMV& X) { +/// \brief Computes R := alpha*X +/// +/// This function is non-blocking and thread-safe +/// +/// \tparam execution_space a Kokkos execution space where 
the kernel will run. +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV. +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// +/// \param space [in] the execution space instance on which the kernel will run. +/// \param R [in/out] view of type RMV in which the results will be stored. +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +template +void scal(const execution_space& space, const RMV& R, const AV& a, + const XMV& X) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::scal: execution_space must be a valid Kokkos " + "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "R is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: RMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: XMV must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::scal: XMV must be assignable to RMV"); static_assert(std::is_same::value, "KokkosBlas::scal: R is const. " @@ -67,23 +97,41 @@ void scal(const RMV& R, const AV& a, const XMV& X) { // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. AV may be either a rank-1 View, or a scalar // value. 
- typedef Kokkos::View > - RMV_Internal; - typedef Kokkos::View > - XMV_Internal; - typedef typename KokkosKernels::Impl::GetUnifiedScalarViewType< - AV, XMV_Internal, true>::type AV_Internal; + using RMV_Internal = Kokkos::View >; + using XMV_Internal = Kokkos::View >; + using AV_Internal = + typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; RMV_Internal R_internal = R; AV_Internal a_internal = a; XMV_Internal X_internal = X; - Impl::Scal::scal( - R_internal, a_internal, X_internal); + Impl::Scal::scal( + space, R_internal, a_internal, X_internal); +} + +/// \brief Computes R := alpha*X +/// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of RMV. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV. +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// +/// \param R [in/out] view of type RMV in which the results will be stored. +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +template +void scal(const RMV& R, const AV& a, const XMV& X) { + scal(typename RMV::execution_space{}, R, a, X); } /// diff --git a/blas/src/KokkosBlas1_sum.hpp b/blas/src/KokkosBlas1_sum.hpp index 0214feaf15..88c7b10021 100644 --- a/blas/src/KokkosBlas1_sum.hpp +++ b/blas/src/KokkosBlas1_sum.hpp @@ -25,58 +25,98 @@ namespace KokkosBlas { /// \brief Return the sum of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] execution space instance where the kernel will run. /// \param x [in] Input 1-D View. /// /// \return The sum product result; a single value. 
-template -typename XVector::non_const_value_type sum(const XVector& x) { +template , + int>::type = 0> +typename XVector::non_const_value_type sum(const execution_space& space, + const XVector& x) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::sum: execution_space must be a valid Kokkos " + "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::sum: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::sum: XVector must be accessible from execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::sum: " "Both Vector inputs must have rank 1."); - typedef Kokkos::View< + using XVector_Internal = Kokkos::View< typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > - XVector_Internal; + typename XVector::device_type, Kokkos::MemoryTraits >; - typedef Kokkos::View > - RVector_Internal; + using layout_t = typename XVector_Internal::array_layout; + + using RVector_Internal = + Kokkos::View >; typename XVector::non_const_value_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Sum::sum(R, X); - Kokkos::fence(); + Impl::Sum::sum(space, R, + X); + space.fence(); return result; } +/// \brief Return the sum of the vector x. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The sum product result; a single value. +template +typename XVector::non_const_value_type sum(const XVector& x) { + return sum(typename XVector::execution_space{}, x); +} + /// \brief R(j) = sum(X(i,j)) /// /// Replace each entry in R with the sumolute value (magnitude) of the /// corresponding entry in X. 
+/// This function is non-blocking and thread-safe. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void sum(const RV& R, const XMV& X, +/// +/// \param space [in] execution space instance where the kernel will run. +/// \param R [out] Output View (rank 0 or 1) containing the results. +/// \param X [in] Input View (rank 1 or 2). +template +void sum(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::sum: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::sum: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::sum: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::sum: XMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::sum: R is const. " @@ -116,7 +156,29 @@ void sum(const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Sum::sum(R_internal, X_internal); + Impl::Sum::sum(space, R_internal, + X_internal); +} + +/// \brief R(j) = sum(X(i,j)) +/// +/// Replace each entry in R with the sum of the entries in the +/// corresponding column of X. +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. +/// +/// \tparam RV 0-D or 1-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. Its rank must be +/// one higher than that of RV, and its entries must be assignable to +/// those of RV. 
+/// +/// \param R [out] Output View (rank 0 or 1) containing the results. +/// \param X [in] Input View (rank 1 or 2). +template +void sum(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + sum(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_team_axpby.hpp b/blas/src/KokkosBlas1_team_axpby.hpp index 165683df01..374bc42390 100644 --- a/blas/src/KokkosBlas1_team_axpby.hpp +++ b/blas/src/KokkosBlas1_team_axpby.hpp @@ -37,9 +37,7 @@ axpy(const TeamType& team, const typename XVector::non_const_value_type& a, const XVector& x, const YVector& y) { KokkosBlas::Experimental::axpby( team, a, x, - Kokkos::Details::ArithTraits< - typename YVector::non_const_value_type>::one(), - y); + Kokkos::ArithTraits::one(), y); } } // namespace Experimental diff --git a/blas/src/KokkosBlas1_update.hpp b/blas/src/KokkosBlas1_update.hpp index 741dc508fb..889f9ede32 100644 --- a/blas/src/KokkosBlas1_update.hpp +++ b/blas/src/KokkosBlas1_update.hpp @@ -25,6 +25,9 @@ namespace KokkosBlas { /// \brief Compute Z := alpha*X + beta*Y + gamma*Z. /// +/// This function is non-blocking and thread-safe +/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. /// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as XMV. @@ -32,10 +35,22 @@ namespace KokkosBlas { /// the same rank as XMV and YMV, and it must make sense to add up /// the entries of XMV and YMV and assign them to the entries of /// ZMV. -template -void update(const typename XMV::non_const_value_type& alpha, const XMV& X, +/// +/// \param space [in] the execution space instance on which the kernel will run. 
+/// \param alpha [in] scaling parameter for X +/// \param X [in] input view of type XMV +/// \param beta [in] scaling parameter for Y +/// \param Y [in] input view of type YMV +/// \param gamma [in] scaling parameter for Z +/// \param Z [in/out] view of type ZMV in which the results will be stored. +template +void update(const execution_space& space, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::update: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::update: " "X is not a Kokkos::View."); @@ -45,6 +60,18 @@ void update(const typename XMV::non_const_value_type& alpha, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::update: " "Z is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: XMV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: YMV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: ZMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::update: Z is const. " @@ -74,27 +101,24 @@ void update(const typename XMV::non_const_value_type& alpha, const XMV& X, // Create unmanaged versions of the input Views. XMV, YMV, and ZMV // may be rank 1 or rank 2, but they must all have the same rank. 
- typedef Kokkos::View< + using XMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > - XMV_Internal; + typename XMV::device_type, Kokkos::MemoryTraits >; - typedef Kokkos::View< + using YMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YMV::device_type, Kokkos::MemoryTraits > - YMV_Internal; + typename YMV::device_type, Kokkos::MemoryTraits >; - typedef Kokkos::View< + using ZMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename ZMV::device_type, Kokkos::MemoryTraits > - ZMV_Internal; + typename ZMV::device_type, Kokkos::MemoryTraits >; XMV_Internal X_internal = X; YMV_Internal Y_internal = Y; @@ -110,10 +134,37 @@ void update(const typename XMV::non_const_value_type& alpha, const XMV& X, << endl; #endif // KOKKOSKERNELS_PRINT_DEMANGLED_TYPE_INFO - return Impl::Update::update( - alpha, X_internal, beta, Y_internal, gamma, Z_internal); + Impl::Update::update(space, alpha, X_internal, beta, Y_internal, + gamma, Z_internal); } +/// \brief Compute Z := alpha*X + beta*Y + gamma*Z. +/// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of ZMV. +/// +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// \tparam ZMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV and YMV, and it must make sense to add up +/// the entries of XMV and YMV and assign them to the entries of +/// ZMV. 
+/// +/// \param alpha [in] scaling parameter for X +/// \param X [in] input view of type XMV +/// \param beta [in] scaling parameter for Y +/// \param Y [in] input view of type YMV +/// \param gamma [in] scaling parameter for Z +/// \param Z [in/out] view of type ZMV in which the results will be stored. +template +void update(const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { + update(typename ZMV::execution_space{}, alpha, X, beta, Y, gamma, Z); +} } // namespace KokkosBlas #endif // KOKKOSBLAS1_UPDATE_HPP_ diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 9dfddff362..a8ebf02ca3 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -16,7 +16,7 @@ #ifndef KOKKOSBLAS2_GEMV_HPP_ #define KOKKOSBLAS2_GEMV_HPP_ -/// \file Kokkos_Blas2_MV.hpp +/// \file KokkosBlas2_gemv.hpp /// \brief BLAS 2 kernels specifically optimized for typical /// Tpetra::MultiVector use cases. 
@@ -49,23 +49,47 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View -template -void gemv(const typename AViewType::execution_space& space, const char trans[], +template +void gemv(const execution_space& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::gemv: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); + "KokkosBlas::gemv: AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); + "KokkosBlas::gemv: XViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); + "KokkosBlas::gemv: YViewType must be a Kokkos::View."); static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); + "KokkosBlas::gemv: AViewType must have rank 2."); static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); + "KokkosBlas::gemv: XViewType must have rank 1."); static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); + "KokkosBlas::gemv: YViewType must have rank 1."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: AViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: XViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: YViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemv: AViewType must be assignable to YViewType"); + 
static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemv: XViewType must be assignable to YViewType"); // Check compatibility of dimensions at run time. if (trans[0] == 'N' || trans[0] == 'n') { @@ -175,9 +199,7 @@ template void gemv(const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - const typename AViewType::execution_space space = - typename AViewType::execution_space(); - gemv(space, trans, alpha, A, x, beta, y); + gemv(typename AViewType::execution_space{}, trans, alpha, A, x, beta, y); } namespace Experimental { diff --git a/blas/src/KokkosBlas2_ger.hpp b/blas/src/KokkosBlas2_ger.hpp new file mode 100644 index 0000000000..fbfc9c1f98 --- /dev/null +++ b/blas/src/KokkosBlas2_ger.hpp @@ -0,0 +1,153 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_HPP_ +#define KOKKOSBLAS2_GER_HPP_ + +#include + +namespace KokkosBlas { + +/// \brief Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}. +/// +/// \tparam ExecutionSpace The type of execution space +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param space [in] Execution space instance on which to run the kernel. +/// This may contain information about which stream to +/// run on. 
+/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param alpha [in] Input coefficient of x * y^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void ger(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + static_assert( + Kokkos::SpaceAccessibility::assignable, + "AViewType memory space must be assignable from XViewType"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "AViewType memory space must be assignable from YViewType"); + + static_assert( + Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank) == 2, + "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, + "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, + "YViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if ((A.extent(0) != x.extent(0)) || (A.extent(1) != y.extent(0))) { + std::ostringstream os; + os << "KokkosBlas::ger: Dimensions of A, x, and y do not match: " + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " + << x.extent(0) << ", y has size " << y.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas::ger: invalid trans[0] = '" << trans[0] + << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((A.extent(0) == 0) || (A.extent(1) == 0)) { + return; + } + + using ALayout = typename AViewType::array_layout; + + // Minimize the number of Impl::GER instantiations, by standardizing + // on particular View specializations for its template parameters. + typedef Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits > + XVT; + + typedef Kokkos::View::array_layout, + typename YViewType::device_type, + Kokkos::MemoryTraits > + YVT; + + typedef Kokkos::View > + AVT; + + Impl::GER::ger(space, trans, alpha, x, y, A); +} + +/// \brief Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}. +/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. 
+/// \param alpha [in] Input coefficient of x * y^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void ger(const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + const typename AViewType::execution_space space = + typename AViewType::execution_space(); + ger( + space, trans, alpha, x, y, A); +} + +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_GER_HPP_ diff --git a/blas/src/KokkosBlas3_gemm.hpp b/blas/src/KokkosBlas3_gemm.hpp index 586302cb01..0cb00c8493 100644 --- a/blas/src/KokkosBlas3_gemm.hpp +++ b/blas/src/KokkosBlas3_gemm.hpp @@ -38,12 +38,13 @@ namespace Impl { // This case must be intercepted here rather than impl in order to call TPL // GEMV instead of TPL GEMM. This codepath was measured to be profitable with // cuBLAS. -template +template bool gemv_based_gemm( - const typename CViewType::execution_space& space, const char transA[], - const char transB[], typename AViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B, - typename CViewType::const_value_type& beta, const CViewType& C, + const execution_space& space, const char transA[], const char transB[], + typename AViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B, typename CViewType::const_value_type& beta, + const CViewType& C, typename std::enable_if::value && !std::is_same -void gemm(const typename CViewType::execution_space& space, const char transA[], +template +void gemm(const execution_space& space, const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::gemm: execution_space must be a 
valid Kokkos " + "execution space"); static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); + "KokkosBlas::gemm: AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); + "KokkosBlas::gemm: BViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); + "KokkosBlas::gemm: CViewType must be a Kokkos::View."); static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); + "KokkosBlas::gemm: AViewType must have rank 2."); static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); + "KokkosBlas::gemm: BViewType must have rank 2."); static_assert(static_cast(CViewType::rank) == 2, - "CViewType must have rank 2."); + "KokkosBlas::gemm: CViewType must have rank 2."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: AViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: BViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: CViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemm: CViewType must be assignable by AViewType"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemm: CViewType must be assignable by BViewType"); // Check validity of transpose argument bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || @@ -197,7 +222,7 @@ void gemm(const typename CViewType::execution_space& space, const char transA[], typename CViewType::device_type, Kokkos::MemoryTraits> CVT; - typedef Impl::GEMM impl_type; + typedef Impl::GEMM impl_type; impl_type::gemm(space, transA, transB, alpha, A, B, beta, C); } @@ -223,9 +248,8 @@ void gemm(const char transA[], const char transB[], typename 
AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { - const typename CViewType::execution_space space = - typename CViewType::execution_space(); - gemm(space, transA, transB, alpha, A, B, beta, C); + gemm(typename CViewType::execution_space{}, transA, transB, alpha, A, B, beta, + C); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trmm.hpp b/blas/src/KokkosBlas3_trmm.hpp index 7e2cbd5b88..bdc86d4d9e 100644 --- a/blas/src/KokkosBlas3_trmm.hpp +++ b/blas/src/KokkosBlas3_trmm.hpp @@ -27,15 +27,22 @@ namespace KokkosBlas { -/// \brief Solve triangular linear system with multiple RHSs: +/// \brief Triangular matrix multiply: +/// /// B = alpha * op(A) * B if side == "L" or "l" /// B = alpha * B * op(A) if side == "R" or "r" /// +/// This function is currently blocking when running the native implementation +/// which only has a serial implementation. +/// +/// \tparam execution_space a Kokkos execution space to run the kernels on. /// \tparam AViewType Input matrix, as a 2-D Kokkos::View /// \tparam BViewType Input(RHS)/Output(solution) M-by-N matrix, as a 2-D /// Kokkos::View /// -/// \param side [in] "L" or "l" indicates matrix A is on the left of B +/// \param space [in] an execution space instance that may contain a stream +/// or a queue to execute the kernel on, this only works with TPLs at the +/// moment. 
\param side [in] "L" or "l" indicates matrix A is on the left of B /// "R" or "r" indicates matrix A is on the right of B /// \param uplo [in] "U" or "u" indicates matrix A is an upper triangular /// matrix @@ -56,10 +63,11 @@ namespace KokkosBlas { /// \param B [in,out] Input/Output matrix, as a 2-D Kokkos::View /// On entry, M-by-N matrix /// On exit, overwritten with the solution -template -void trmm(const char side[], const char uplo[], const char trans[], - const char diag[], typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { +template +void trmm(const execution_space& space, const char side[], const char uplo[], + const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B) { static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -143,8 +151,46 @@ void trmm(const char side[], const char uplo[], const char trans[], typename BViewType::device_type, Kokkos::MemoryTraits >; - KokkosBlas::Impl::TRMM::trmm( - side, uplo, trans, diag, alpha, A, B); + KokkosBlas::Impl::TRMM::trmm(space, side, uplo, trans, + diag, alpha, A, B); +} + +/// \brief Triangular matrix multiply: +/// B = alpha * op(A) * B if side == "L" or "l" +/// B = alpha * B * op(A) if side == "R" or "r" +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// \tparam BViewType Input(RHS)/Output(solution) M-by-N matrix, as a 2-D +/// Kokkos::View +/// +/// \param side [in] "L" or "l" indicates matrix A is on the left of B +/// "R" or "r" indicates matrix A is on the right of B +/// \param uplo [in] "U" or "u" indicates matrix A is an upper triangular +/// matrix +/// "L" or "l" indicates matrix A is a lower triangular matrix +/// \param trans [in] Specifies what op does to A: +/// "N" or "n" for non-transpose, +/// "T" or "t" for transpose, +/// "C" or "c" for conjugate transpose. 
+/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit +/// "N" or "n" indicates the diagonal of A is assumed to be +/// non-unit +/// \param alpha [in] Input coefficient used for +/// multiplication with either A or B +/// \param A [in] Input matrix, as a 2-D Kokkos::View +/// If side == "L" or "l", matrix A is a M-by-M triangular +/// matrix; otherwise, matrix A is a N-by-N triangular matrix +/// \param B [in,out] Input/Output matrix, as a 2-D Kokkos::View +/// On entry, M-by-N matrix +/// On exit, overwritten with the solution +template +void trmm(const char side[], const char uplo[], const char trans[], + const char diag[], typename BViewType::const_value_type& alpha, + const AViewType& A, const BViewType& B) { + trmm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, + B); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index 2e8d2f4cfa..890b2ff6aa 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -30,12 +30,17 @@ namespace KokkosBlas { /// \brief Solve triangular linear system with multiple RHSs: /// op(A)*X = alpha*B if side == "L" or "l" /// X*op(A) = alpha*B if side == "R" or "r" +/// This function is currently blocking when running the native implementation +/// which only has a serial implementation. /// +/// \tparam execution_space a Kokkos execution space to run the kernels on. /// \tparam AViewType Input matrix, as a 2-D Kokkos::View /// \tparam BViewType Input(RHS)/Output(solution) M-by-N matrix, as a 2-D /// Kokkos::View /// -/// \param side [in] "L" or "l" indicates matrix A is on the left of X +/// \param space [in] an execution space instance that may contain a stream +/// or a queue to execute the kernel on, this only works with TPLs at the +/// moment. 
\param side [in] "L" or "l" indicates matrix A is on the left of X /// "R" or "r" indicates matrix A is on the right of X /// \param uplo [in] "U" or "u" indicates matrix A upper part is stored, the /// other part is not referenced @@ -54,10 +59,11 @@ namespace KokkosBlas { /// \param B [in,out] Input/Output matrix, as a 2-D Kokkos::View /// On entry, M-by-N matrix of multile RHS /// On exit, overwritten with the solution X -template -void trsm(const char side[], const char uplo[], const char trans[], - const char diag[], typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { +template +void trsm(const execution_space& space, const char side[], const char uplo[], + const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B) { static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -141,9 +147,44 @@ void trsm(const char side[], const char uplo[], const char trans[], typename BViewType::device_type, Kokkos::MemoryTraits >; - KokkosBlas::Impl::TRSM::trsm(side, uplo, trans, diag, alpha, A, B); + KokkosBlas::Impl::TRSM::trsm( + space, side, uplo, trans, diag, alpha, A, B); } +/// \brief Solve triangular linear system with multiple RHSs: +/// op(A)*X = alpha*B if side == "L" or "l" +/// X*op(A) = alpha*B if side == "R" or "r" +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// \tparam BViewType Input(RHS)/Output(solution) M-by-N matrix, as a 2-D +/// Kokkos::View +/// +/// \param side [in] "L" or "l" indicates matrix A is on the left of X +/// "R" or "r" indicates matrix A is on the right of X +/// \param uplo [in] "U" or "u" indicates matrix A upper part is stored, the +/// other part is not referenced +/// "L" or "l" indicates matrix A lower part is stored, the +/// other part is not referenced +/// \param trans [in] "N" or "n" for non-transpose, "T" or "t" for transpose, +/// "C" or "c" for 
conjugate transpose. +/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit +// "N" or "n" indicates the diagonal of A is assumed to be +// non-unit +/// \param alpha [in] Input coefficient used for multiplication with B +/// \param A [in] Input matrix, as a 2-D Kokkos::View +/// If side == "L" or "l", matrix A is a M-by-M triangular +/// matrix; otherwise, matrix A is a N-by-N triangular matrix +/// \param B [in,out] Input/Output matrix, as a 2-D Kokkos::View +/// On entry, M-by-N matrix of multiple RHS +/// On exit, overwritten with the solution X +template +void trsm(const char side[], const char uplo[], const char trans[], + const char diag[], typename BViewType::const_value_type& alpha, + const AViewType& A, const BViewType& B) { + trsm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, + B); +} } // namespace KokkosBlas #endif // KOKKOS_BLAS3_TRSM_HPP_ diff --git a/blas/src/KokkosBlas_trtri.hpp b/blas/src/KokkosBlas_trtri.hpp index 22556bc35a..b1a34f0483 100644 --- a/blas/src/KokkosBlas_trtri.hpp +++ b/blas/src/KokkosBlas_trtri.hpp @@ -28,6 +28,7 @@ namespace KokkosBlas { /// \brief Find the inverse of the triangular matrix, A +/// /// A = inv(A) /// /// \tparam AViewType Input matrix, as a 2-D Kokkos::View diff --git a/blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp index 9f66f8ee61..9fada3ff9c 100644 --- a/blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct abs_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index 7ae8ef87b3..e2b04e300d 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ 
-20,7 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_tpl_spec_avail { enum : bool { value = false }; }; @@ -36,7 +37,7 @@ namespace Impl { #define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct axpby_tpl_spec_avail< \ - SCALAR, \ + ExecSpace, SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -63,7 +64,7 @@ KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, #define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct axpby_tpl_spec_avail< \ - SCALAR, \ + ExecSpace, SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index 74fa4265d8..65154b9985 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -43,7 +43,7 @@ namespace Impl { #define KOKKOSBLAS1_DAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Axpby< \ - double, \ + ExecSpace, double, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ double, \ @@ -60,8 +60,8 @@ namespace Impl { Kokkos::MemoryTraits > \ YV; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,double]"); \ if ((X.extent(0) < INT_MAX) && (beta == 1.0)) { \ axpby_print_specialization(); \ @@ -69,8 +69,8 @@ namespace Impl { int one = 1; \ HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -78,7 +78,7 @@ namespace Impl { #define KOKKOSBLAS1_SAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Axpby< \ - float, \ + 
ExecSpace, float, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ float, \ @@ -95,8 +95,8 @@ namespace Impl { Kokkos::MemoryTraits > \ YV; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,float]"); \ if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ axpby_print_specialization(); \ @@ -104,94 +104,94 @@ namespace Impl { int one = 1; \ HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_BLAS,complex]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - const std::complex alpha_val = alpha; \ - HostBlas >::axpy( \ - N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + 
Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::axpby[TPL_BLAS,complex]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy( \ + N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_BLAS,complex]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - const std::complex alpha_val = alpha; \ - HostBlas >::axpy( \ - N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else \ - 
Axpby::axpby( \ - alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::axpby[TPL_BLAS,complex]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy( \ + N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) @@ -225,7 +225,7 @@ namespace Impl { #define KOKKOSBLAS1_DAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Axpby< \ - double, \ + ExecSpace, double, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ double, \ @@ -243,8 +243,8 @@ namespace Impl { YV; \ typedef typename XV::size_type size_type; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast(INT_MAX)) && (beta == 1.0)) { \ @@ -253,10 +253,14 @@ namespace 
Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -264,7 +268,7 @@ namespace Impl { #define KOKKOSBLAS1_SAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Axpby< \ - float, \ + ExecSpace, float, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ float, \ @@ -282,8 +286,8 @@ namespace Impl { YV; \ typedef typename XV::size_type size_type; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ @@ -292,101 +296,112 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - 
Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZaxpy(s.handle, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one); \ - } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const 
size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy( \ + s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one); \ - } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ - 
Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy( \ + s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 97f4be71da..ca2139980d 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -20,8 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists 
-template +template struct dot_tpl_spec_avail { enum : bool { value = false }; }; @@ -37,6 +37,7 @@ namespace Impl { #define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct dot_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -61,27 +62,31 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS // double -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct dot_tpl_spec_avail< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1> { \ enum : bool { value = true }; \ }; -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) #endif diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index e7054b1113..718e32f14c 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -43,6 +43,7 @@ namespace Impl { #define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Dot< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -59,7 +60,7 @@ namespace Impl { XV; \ typedef typename XV::size_type 
size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -68,49 +69,53 @@ namespace Impl { int one = 1; \ R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ - } else { \ - Dot::dot(R, X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Dot< \ + ExecSpace, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { 
\ + dot_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ + } else { \ + Dot::dot(space, R, \ + X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ + struct Dot, LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -128,7 +133,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::dot[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -140,7 +145,8 @@ namespace Impl { N, reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -148,7 +154,8 @@ namespace Impl { #define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ + struct Dot, LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -166,7 +173,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::dot[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -178,7 +185,8 @@ namespace Impl { N, reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -212,27 +220,28 @@ KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace 
KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Dot< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -243,113 +252,119 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ cublasDdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Dot< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSdot(s.handle, N, X.data(), one, 
Y.data(), one, &R()); \ - } else { \ - Dot::dot(R, X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot< \ + EXECSPACE, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + dot_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasSdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ + } else { \ + Dot::dot(space, R, \ + X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - const int N = 
static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZdotc(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ - } else { \ - Dot::dot(R, X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + dot_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasZdotc(s.handle, N, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(&R())); \ + } else { \ + Dot::dot(space, R, \ + X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ + template <> \ + struct Dot, LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + 
Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits > \ RV; \ typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -363,31 +378,32 @@ namespace Impl { one, reinterpret_cast(Y.data()), one, \ reinterpret_cast(&R())); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) 
-KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp index 37b61a2361..616c26c87a 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct iamax_tpl_spec_avail { enum : bool { value = false }; }; @@ -37,6 +37,7 @@ namespace Impl { MEMSPACE) \ template \ struct iamax_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -61,20 +62,24 @@ KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, // double #define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ MEMSPACE) \ - template \ + template <> \ struct iamax_tpl_spec_avail< \ + Kokkos::Cuda, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ }; \ - template \ + template <> \ struct iamax_tpl_spec_avail< \ - Kokkos::View, \ + Kokkos::Cuda, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ @@ -126,51 +131,47 @@ KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ - MEMSPACE) \ - template \ - struct iamax_tpl_spec_avail< \ - Kokkos::View >, \ - Kokkos::View, \ - 
Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template \ - struct iamax_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ + MEMSPACE) \ + template <> \ + struct iamax_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, double, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, double, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, float, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, float, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, - 
Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) #endif diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp index 958bc4d218..913ec5a151 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp @@ -49,7 +49,8 @@ namespace Impl { #define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS( \ SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Iamax >, \ Kokkos::View, \ @@ -64,7 +65,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void iamax(RV& R, const XV& X) { \ + static void iamax(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_BLAS," #SCALAR_TYPE \ "]"); \ const size_type numElems = X.extent(0); \ @@ -81,7 +82,8 @@ namespace Impl { N, reinterpret_cast(X.data()), LDX); \ R() = static_cast(idx); \ } else { \ - Iamax::iamax(R, X); \ + Iamax::iamax(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -148,25 +150,27 @@ using CUBLASUVM_DEVICE_TYPE = #endif #define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, RET_DEVICE_TYPE, CUBLAS_PTR_MODE_1, CUBLAS_PTR_MODE_2) \ - template \ - struct Iamax \ + struct Iamax >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void iamax(RV& R, const XV& X) { \ + static void iamax(const EXEC_SPACE& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::iamax[TPL_CUBLAS," #SCALAR_TYPE "]"); \ const size_type numElems = X.extent(0); \ @@ -181,6 +185,8 @@ using CUBLASUVM_DEVICE_TYPE = const int LDX = (XST == 0) ? 
1 : XST; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t prevPtrMode; \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasGetPointerMode(s.handle, &prevPtrMode)); \ @@ -194,9 +200,11 @@ using CUBLASUVM_DEVICE_TYPE = if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } \ } else { \ - Iamax::iamax(R, X); \ + Iamax::iamax(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -206,26 +214,26 @@ using CUBLASUVM_DEVICE_TYPE = CUBLAS_FN, INDEX_TYPE, LAYOUT, \ MEMSPACE, ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, Kokkos::HostSpace, CUBLAS_POINTER_MODE_HOST, \ - CUBLAS_POINTER_MODE_DEVICE) \ + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, CUBLAS_DEVICE_TYPE, CUBLAS_POINTER_MODE_DEVICE, \ - CUBLAS_POINTER_MODE_HOST) + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLAS_DEVICE_TYPE, \ + CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) #if defined(KOKKOS_ENABLE_CUDA_UVM) -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, Kokkos::HostSpace, CUBLAS_POINTER_MODE_HOST, \ - 
CUBLAS_POINTER_MODE_DEVICE) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, CUBLASUVM_DEVICE_TYPE, CUBLAS_POINTER_MODE_DEVICE, \ - CUBLAS_POINTER_MODE_HOST) +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLASUVM_DEVICE_TYPE, \ + CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) #endif #define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ @@ -372,30 +380,31 @@ KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, namespace KokkosBlas { namespace Impl { -using ROCBLAS_DEVICE_TYPE = - Kokkos::Device; +using ROCBLAS_DEVICE_TYPE = Kokkos::Device; #define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER( \ SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, ROCBLAS_PTR_MODE_1, \ ROCBLAS_PTR_MODE_2) \ - template \ - struct Iamax \ + struct Iamax >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = Kokkos::HIP; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void iamax(RV& R, const XV& X) { \ + static void iamax(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ const size_type numElems = 
X.extent(0); \ @@ -410,6 +419,8 @@ using ROCBLAS_DEVICE_TYPE = const int LDX = (XST == 0) ? 1 : XST; \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode prevPtrMode; \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ @@ -421,12 +432,14 @@ using ROCBLAS_DEVICE_TYPE = ROCBLAS_FN(s.handle, N, \ reinterpret_cast(X.data()), \ LDX, reinterpret_cast(R.data()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ } \ } else { \ - Iamax::iamax(R, X); \ + Iamax::iamax(space, \ + R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -469,44 +482,44 @@ using ROCBLAS_DEVICE_TYPE = INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) 
KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp index 5b0b5662ba..8d3fc0f4d2 100644 --- a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp @@ -20,7 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index a2ce0d4390..04ec811990 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ 
b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm1_tpl_spec_avail { enum : bool { value = false }; }; @@ -36,6 +36,7 @@ namespace Impl { #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrm1_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -63,6 +64,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrm1_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -84,6 +86,33 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, #endif +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View< \ + typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) + +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 559615d105..b5b6e061ec 100644 --- 
a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -42,6 +42,7 @@ namespace Impl { #define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -56,7 +57,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -65,7 +66,7 @@ namespace Impl { int one = 1; \ R() = HostBlas::asum(N, X.data(), one); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -74,6 +75,7 @@ namespace Impl { #define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -88,7 +90,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -97,7 +99,7 @@ namespace Impl { int one = 1; \ R() = HostBlas::asum(N, X.data(), one); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -105,7 +107,8 @@ namespace Impl { #define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Nrm1 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -120,7 +123,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ 
"KokkosBlas::nrm1[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -131,7 +134,7 @@ namespace Impl { R() = HostBlas >::asum( \ N, reinterpret_cast*>(X.data()), one); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -139,7 +142,8 @@ namespace Impl { #define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Nrm1 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -154,7 +158,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm1[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -165,7 +169,7 @@ namespace Impl { R() = HostBlas >::asum( \ N, reinterpret_cast*>(X.data()), one); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -203,25 +207,27 @@ KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Nrm1< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -230,33 +236,40 @@ namespace Impl { constexpr int one = 1; 
\ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDasum(s.handle, N, X.data(), one, R.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, \ + R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Nrm1< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -265,18 +278,224 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSasum(s.handle, N, X.data(), one, R.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, \ + R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define 
KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDzasum( \ + s.handle, N, reinterpret_cast(X.data()), \ + one, R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, \ + R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = 
static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasScasum( \ + s.handle, N, reinterpret_cast(X.data()), one, \ + R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, \ + R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1< \ + ExecSpace, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ template \ - struct Nrm1 >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ 
-291,83 +510,92 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ + "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ nrm1_print_specialization(); \ const int N = static_cast(numElems); \ constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDzasum(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data()); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( \ + s.handle, N, \ + reinterpret_cast(X.data()), one, \ + R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - 
cublasScasum(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data()); \ - } else { \ - Nrm1::nrm1(R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( \ + s.handle, N, \ + reinterpret_cast(X.data()), one, \ + R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) 
-KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) + +KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) + +KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) + +KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 8b5476fd40..a58c90d8e9 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2_tpl_spec_avail { enum : bool { value = false }; }; @@ -35,6 +35,7 @@ namespace Impl { #define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrm2_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -60,13 +61,15 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS // double #define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ + template <> \ struct nrm2_tpl_spec_avail< \ + Kokkos::Cuda, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ 
- Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 11a8894ca6..5e017cb7e1 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -42,6 +42,7 @@ namespace Impl { #define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Nrm2< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -56,7 +57,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -66,7 +68,8 @@ namespace Impl { R() = HostBlas::nrm2(N, X.data(), int_one); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -75,6 +78,7 @@ namespace Impl { #define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Nrm2< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -89,7 +93,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -99,7 +104,8 @@ namespace Impl { R() = HostBlas::nrm2(N, X.data(), int_one); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ } \ 
Kokkos::Profiling::popRegion(); \ } \ @@ -107,7 +113,8 @@ namespace Impl { #define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Nrm2 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -122,7 +129,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm2[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -135,7 +143,8 @@ namespace Impl { int_one); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -143,7 +152,8 @@ namespace Impl { #define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Nrm2 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -158,7 +168,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm2[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -171,7 +182,8 @@ namespace Impl { int_one); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -209,61 +221,70 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm2< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - 
Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDnrm2(s.handle, N, X.data(), int_one, &R()); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2< \ + EXECSPACE, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const execution_space& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int int_one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDnrm2(s.handle, N, X.data(), int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2( \ + space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define 
KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Nrm2< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const execution_space& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -272,73 +293,88 @@ namespace Impl { constexpr int int_one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSnrm2(s.handle, N, X.data(), int_one, &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSnrm2(s.handle, N, X.data(), int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2( \ + space, R, X, take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - 
"KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDznrm2(s.handle, N, \ - reinterpret_cast(X.data()), \ - int_one, &R()); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const execution_space& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int int_one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDznrm2( \ + s.handle, N, reinterpret_cast(X.data()), \ + int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2( \ + space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define 
KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ - struct Nrm2 \ + struct Nrm2 >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const execution_space& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -348,36 +384,40 @@ namespace Impl { constexpr int int_one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasScnrm2(s.handle, N, \ - reinterpret_cast(X.data()), int_one, \ - &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasScnrm2( \ + s.handle, N, reinterpret_cast(X.data()), \ + int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2( \ + space, R, X, take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) 
+KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp index 7613190645..8a45b46521 100644 --- a/blas/tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2w_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 2fd3da50ee..88591fbf0c 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrminf_tpl_spec_avail { enum : bool { value = false }; }; @@ -36,6 +36,7 @@ namespace Impl { #define 
KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrminf_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index aad5bbd7d4..17ec54e057 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -43,6 +43,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct NrmInf< \ + ExecSpace, \ Kokkos::View>, \ Kokkos::View, \ @@ -58,7 +59,7 @@ namespace Impl { typedef typename XV::size_type size_type; \ typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ - static void nrminf(RV& R, const XV& X) { \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { \ @@ -72,7 +73,8 @@ namespace Impl { int idx = HostBlas::iamax(N, X.data(), one) - 1; \ R() = IPT::norm(X(idx)); \ } else { \ - NrmInf::nrminf(R, X); \ + NrmInf::nrminf(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -82,6 +84,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct NrmInf< \ + ExecSpace, \ Kokkos::View>, \ Kokkos::View, \ @@ -97,7 +100,7 @@ namespace Impl { typedef typename XV::size_type size_type; \ typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ - static void nrminf(RV& R, const XV& X) { \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { \ @@ -111,7 +114,8 @@ namespace Impl { int idx = HostBlas::iamax(N, X.data(), one) - 1; \ R() = IPT::norm(X(idx)); \ } else { \ - NrmInf::nrminf(R, X); \ + NrmInf::nrminf(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -120,7 +124,8 @@ 
namespace Impl { #define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ ETI_SPEC_AVAIL) \ template \ - struct NrmInf>, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -137,7 +142,7 @@ namespace Impl { typedef Kokkos::Details::InnerProductSpaceTraits> \ IPT; \ \ - static void nrminf(RV& R, const XV& X) { \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -156,55 +161,58 @@ namespace Impl { 1; \ R() = IPT::norm(X(idx)); \ } else { \ - NrmInf::nrminf(R, X); \ + NrmInf::nrminf(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> \ - IPT; \ - \ - static void nrminf(RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0f; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = \ - HostBlas>::iamax( \ - N, reinterpret_cast*>(X.data()), \ - one) - \ - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> \ + RV; \ + typedef Kokkos::View*, 
LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits> \ + IPT; \ + \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0f; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = \ + HostBlas>::iamax( \ + N, reinterpret_cast*>(X.data()), \ + one) - \ + 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, \ + X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, diff --git a/blas/tpls/KokkosBlas1_reciprocal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_reciprocal_tpl_spec_avail.hpp index 5879131808..636d3fe61f 100644 --- a/blas/tpls/KokkosBlas1_reciprocal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_reciprocal_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct reciprocal_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index d340c6bc02..5c5a6008ec 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -20,7 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_tpl_spec_avail { enum : bool { value = false }; }; @@ -36,6 +37,7 @@ namespace Impl { #define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct scal_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ 
SCALAR, \ @@ -59,63 +61,71 @@ KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // double -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct scal_tpl_spec_avail< \ - Kokkos::View, \ + EXECSPACE, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define 
KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct scal_tpl_spec_avail< \ - Kokkos::View, \ + EXECSPACE, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) #endif diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 8b3d8e6d95..da11555f7b 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -42,6 +42,7 @@ namespace Impl { LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Scal< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR_TYPE, \ @@ -60,7 +61,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void scal(const RV& R, const AS& alpha, const XV& X) { \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ + const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS," #SCALAR_TYPE \ "]"); \ const size_type numElems = X.extent(0); \ @@ -73,7 +75,8 @@ namespace Impl { HostBlas::scal( \ N, 
alpha_b, reinterpret_cast(R.data()), one); \ } else { \ - Scal::scal(R, alpha, X); \ + Scal::scal(space, R, \ + alpha, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -129,48 +132,54 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, \ - CUBLAS_FN, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Scal< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const RV& R, const AS& alpha, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && \ - (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - } else { \ - Scal::scal(R, alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, \ + CUBLAS_FN, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Scal< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename 
XV::size_type size_type; \ + \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ + const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && \ + (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Scal::scal(space, R, \ + alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ @@ -248,29 +257,32 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, MEMSPACE, \ + SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Scal< \ - Kokkos::View, \ + EXECSPACE, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR_TYPE, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ RV; \ typedef SCALAR_TYPE AS; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void scal(const RV& R, const AS& alpha, const XV& X) { \ + static void scal(const execution_space& space, const RV& R, \ + const AS& alpha, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); 
\ const size_type numElems = X.extent(0); \ @@ -281,57 +293,68 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ s.handle, N, reinterpret_cast(&alpha), \ reinterpret_cast(R.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, pointer_mode)); \ } else { \ - Scal::scal(R, alpha, X); \ + Scal::scal(space, R, \ + alpha, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, \ + LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) + EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - 
KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) - -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) + +#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ + EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) 
+KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp index 12470c28ed..755fa092fb 100644 --- a/blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct sum_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp index 76eb7cb37a..88a60e6d19 100644 --- a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp @@ -20,7 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp new file mode 100644 index 0000000000..b672c690d5 --- /dev/null +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -0,0 +1,205 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct ger_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side BLAS (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Serial, Kokkos::HostSpace) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + 
Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::OpenMP, Kokkos::HostSpace) +#endif + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// We use the same layout for X, Y and A because the GER interface will +// switch the layouts of X and Y to that of A. So this TPL version will +// match any layout combination, as long as none are LayoutStride. 
+ +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + 
Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) + +#endif +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl.hpp new file mode 100644 index 0000000000..f61e896951 --- /dev/null +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl.hpp @@ -0,0 +1,35 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_TPL_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_GER_TPL_SPEC_DECL_HPP_ + +// BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +#include +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include +#endif + +#endif diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp new file mode 100644 index 0000000000..3ba437a5a7 --- /dev/null +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -0,0 +1,342 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_TPL_SPEC_DECL_BLAS_HPP_ +#define KOKKOSBLAS2_GER_TPL_SPEC_DECL_BLAS_HPP_ + +#include "KokkosBlas_Host_tpl.hpp" + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); + +#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& /* space */ \ + , \ + const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ + A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ + A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& /* space */ \ + , \ + const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, 
alpha, X.data(), one, Y.data(), one, \ + A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ + A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& /* space */ \ + , \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = \ + static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + throw std::runtime_error( \ + "Error: blasZgerc() requires LayoutLeft views."); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define 
KOKKOSBLAS2_CGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& /* space */ \ + , \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = \ + static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + throw std::runtime_error( \ + "Error: blasCgerc() requires LayoutLeft views."); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) 
+KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + 
+KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) +#endif + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp new file mode 100644 index 0000000000..d05b09784e --- /dev/null +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -0,0 +1,353 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_TPL_SPEC_DECL_CUBLAS_HPP_ +#define KOKKOSBLAS2_GER_TPL_SPEC_DECL_CUBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ + const int N = static_cast(A_is_lr ? 
A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); + +#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), one, \ + A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), one, \ + A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; 
\ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), one, \ + A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), one, \ + A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: cublasZgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); 
\ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: cublasCgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) + +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) 
+KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) + +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) + +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) 
+KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp new file mode 100644 index 0000000000..c55d091516 --- /dev/null +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -0,0 +1,324 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_TPL_SPEC_DECL_ROCBLAS_HPP_ +#define KOKKOSBLAS2_GER_TPL_SPEC_DECL_ROCBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); + +#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), \ + one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), \ + one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + 
AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), \ + one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), \ + one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgerc( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: rocblasZgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + 
KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgerc( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: rocblasCgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) +
+KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp index f689ba079c..6f6a7a2e9f 100644 --- a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp @@ -19,7 +19,7 @@ #include "KokkosBlas_util.hpp" #include "KokkosBatched_Vector.hpp" -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && !defined(KOKKOS_ENABLE_SYCL) #include "mkl_version.h" #if __INTEL_MKL__ >= 2018 #define __KOKKOSBLAS_ENABLE_INTEL_MKL_COMPACT__ 1 diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 25d8818817..69146baf4f 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -14,13 +14,13 @@ // //@HEADER -#ifndef KOKKOSBLAS3_GEMV_TPL_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS3_GEMV_TPL_SPEC_AVAIL_HPP_ +#ifndef KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_HPP_ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gemm_tpl_spec_avail { enum : bool { value = false }; }; @@ -32,6 +32,7 
@@ struct gemm_tpl_spec_avail { LAYOUTC, MEMSPACE) \ template \ struct gemm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -78,6 +79,7 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, LAYOUTC, MEMSPACE) \ template \ struct gemm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -149,6 +151,7 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, #define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gemm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 5508b892e7..66177e28a6 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -26,7 +26,8 @@ namespace Impl { #define KOKKOSBLAS3_XGEMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct GEMM, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ CViewType; \ \ - static void gemm(const typename CViewType::execution_space& /* space*/, \ - const char transA[], const char transB[], \ + static void gemm(const ExecSpace& /* space*/, const char transA[], \ + const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ @@ -163,7 +164,8 @@ namespace Impl { LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMM, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ CViewType; \ \ - static void gemm(const typename CViewType::execution_space& space, \ - const char transA[], const char transB[], \ + static void gemm(const ExecSpace& space, const char transA[], \ + const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ @@ -364,7 +366,8 @@ namespace Impl { 
ROCBLAS_FN, LAYOUT, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMM, \ Kokkos::MemoryTraits >, \ Kokkos::View, rocblas_float_complex, \ rocblas_cgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) 
+KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp index e44f3a9db7..010b44a154 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp @@ -21,7 +21,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct trmm_tpl_spec_avail { enum : bool { value = false }; }; @@ -33,6 +33,7 @@ struct trmm_tpl_spec_avail { MEMSPACE) \ template \ struct trmm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -71,6 +72,7 @@ KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, MEMSPACE) \ template \ struct trmm_tpl_spec_avail< \ + Kokkos::Cuda, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp index 44771da56f..53c73f7416 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp @@ -27,7 +27,8 @@ namespace Impl { #define KOKKOSBLAS3_TRMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, 
LAYOUTB, \ MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRMM, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ BViewType; \ \ - static void trmm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trmm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion("KokkosBlas::trmm[TPL_BLAS," #SCALAR_TYPE \ @@ -168,7 +169,8 @@ namespace Impl { #define KOKKOSBLAS3_TRMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, \ LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRMM, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ BViewType; \ \ - static void trmm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trmm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -242,18 +244,24 @@ namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_layout_left) \ - CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB); \ - else \ - CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_layout_left) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), \ + LDA, reinterpret_cast(B.data()), 
LDB, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), \ + LDA, reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp index 2af72d4950..d1836809ec 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp @@ -21,7 +21,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct trsm_tpl_spec_avail { enum : bool { value = false }; }; @@ -33,6 +33,7 @@ struct trsm_tpl_spec_avail { MEMSPACE) \ template \ struct trsm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -71,6 +72,7 @@ KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, MEMSPACE) \ template \ struct trsm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp index 0d2f38ed6e..ec36388094 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp @@ -26,6 +26,7 @@ namespace Impl { #define KOKKOSBLAS3_DTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRSM< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -42,8 +43,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ 
const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,double]"); \ @@ -94,6 +95,7 @@ namespace Impl { #define KOKKOSBLAS3_STRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRSM< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -110,8 +112,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,float]"); \ @@ -161,7 +163,8 @@ namespace Impl { #define KOKKOSBLAS3_ZTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM**, LAYOUTA, \ + struct TRSM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -178,8 +181,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -235,7 +238,8 @@ namespace Impl { #define KOKKOSBLAS3_CTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM**, LAYOUTA, \ + struct TRSM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -252,8 +256,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char 
trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -357,6 +361,7 @@ namespace Impl { #define KOKKOSBLAS3_DTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRSM< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -373,8 +378,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,double]"); \ @@ -427,13 +432,18 @@ namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_ll) \ - cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB); \ - else \ - cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB); \ - \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ + A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ + A.data(), LDA, B.data(), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -441,6 +451,7 @@ namespace Impl { #define KOKKOSBLAS3_STRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRSM< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -457,8 +468,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void 
trsm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,float]"); \ @@ -511,12 +522,18 @@ namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_ll) \ - cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB); \ - else \ - cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ + A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ + A.data(), LDA, B.data(), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ @@ -524,7 +541,8 @@ namespace Impl { #define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM**, LAYOUTA, \ + struct TRSM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -541,8 +559,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -596,16 +614,22 @@ namespace Impl { \ 
KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_ll) \ - cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ - else \ - cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm( \ + s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm( \ + s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ @@ -613,7 +637,8 @@ namespace Impl { #define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM**, LAYOUTA, \ + struct TRSM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -630,8 +655,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -685,16 +710,22 @@ namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_ll) \ - cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ - 
reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ - else \ - cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 24276f4a77..37733f609e 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -225,6 +225,30 @@ void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, int*, int*, const std::complex*, /* */ std::complex*, int*); +/// +/// Ger +/// +void F77_BLAS_MANGLE(sger, SGER)(int*, int*, const float*, const float*, int*, + const float*, int*, float*, int*); +void F77_BLAS_MANGLE(dger, DGER)(int*, int*, const double*, const double*, int*, + const double*, int*, double*, int*); +void F77_BLAS_MANGLE(cgeru, CGERU)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(cgerc, CGERC)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(zgeru, ZGERU)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(zgerc, ZGERC)(int*, int*, 
const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); + /// /// Trsv /// @@ -439,6 +463,13 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CGEMV F77_BLAS_MANGLE(cgemv, CGEMV) #define F77_FUNC_ZGEMV F77_BLAS_MANGLE(zgemv, ZGEMV) +#define F77_FUNC_SGER F77_BLAS_MANGLE(sger, SGER) +#define F77_FUNC_DGER F77_BLAS_MANGLE(dger, DGER) +#define F77_FUNC_CGERU F77_BLAS_MANGLE(cgeru, CGERU) +#define F77_FUNC_CGERC F77_BLAS_MANGLE(cgerc, CGERC) +#define F77_FUNC_ZGERU F77_BLAS_MANGLE(zgeru, ZGERU) +#define F77_FUNC_ZGERC F77_BLAS_MANGLE(zgerc, ZGERC) + #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) #define F77_FUNC_CTRSV F77_BLAS_MANGLE(ctrsv, CTRSV) @@ -540,6 +571,12 @@ void HostBlas::gemv(const char trans, int m, int n, const float alpha, F77_FUNC_SGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> +void HostBlas::ger(int m, int n, const float alpha, const float* x, + int incx, const float* y, int incy, float* a, + int lda) { + F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const float* a, int lda, /* */ float* b, int ldb) { @@ -653,6 +690,12 @@ void HostBlas::gemv(const char trans, int m, int n, const double alpha, F77_FUNC_DGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> +void HostBlas::ger(int m, int n, const double alpha, const double* x, + int incx, const double* y, int incy, double* a, + int lda) { + F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const double* a, int lda, /* */ double* b, int ldb) { @@ -768,7 +811,6 @@ void HostBlas >::swap(int const N, std::complex* X, int const incy) { F77_FUNC_CSWAP(&N, X, &incx, Y, &incy); } - template <> void HostBlas >::gemv(const char trans, int m, int 
n, const std::complex alpha, @@ -782,6 +824,24 @@ void HostBlas >::gemv(const char trans, int m, int n, (std::complex*)c, &ldc); } template <> +void HostBlas >::geru( + int m, int n, const std::complex alpha, const std::complex* x, + int incx, const std::complex* y, int incy, std::complex* a, + int lda) { + F77_FUNC_CGERU(&m, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, (std::complex*)a, + &lda); +} +template <> +void HostBlas >::gerc( + int m, int n, const std::complex alpha, const std::complex* x, + int incx, const std::complex* y, int incy, std::complex* a, + int lda) { + F77_FUNC_CGERC(&m, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, (std::complex*)a, + &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, int lda, @@ -923,7 +983,6 @@ void HostBlas >::swap(int const N, std::complex* X, int const incy) { F77_FUNC_ZSWAP(&N, X, &incx, Y, &incy); } - template <> void HostBlas >::gemv( const char trans, int m, int n, const std::complex alpha, @@ -935,6 +994,24 @@ void HostBlas >::gemv( (std::complex*)c, &ldc); } template <> +void HostBlas >::geru( + int m, int n, const std::complex alpha, + const std::complex* x, int incx, const std::complex* y, + int incy, std::complex* a, int lda) { + F77_FUNC_ZGERU(&m, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, + (std::complex*)a, &lda); +} +template <> +void HostBlas >::gerc( + int m, int n, const std::complex alpha, + const std::complex* x, int incx, const std::complex* y, + int incy, std::complex* a, int lda) { + F77_FUNC_ZGERC(&m, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, + (std::complex*)a, &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index da89b5aa5d..cd53537ea6 
100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -64,6 +64,15 @@ struct HostBlas { int lda, const T *b, int ldb, const T beta, /* */ T *c, int ldc); + static void ger(int m, int n, const T alpha, const T *x, int incx, const T *y, + int incy, T *a, int lda); + + static void geru(int m, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + + static void gerc(int m, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + static void trsv(const char uplo, const char transa, const char diag, int m, const T *a, int lda, /* */ T *b, int ldb); diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index cfa2e41c3e..ff955d13a8 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -61,6 +61,7 @@ // Blas 2 #include "Test_Blas2_gemv.hpp" +#include "Test_Blas2_ger.hpp" // Serial Blas 2 #include "Test_Blas2_serial_gemv.hpp" diff --git a/blas/unit_test/Test_Blas1_abs.hpp b/blas/unit_test/Test_Blas1_abs.hpp index ff91087613..8a2c7e3374 100644 --- a/blas/unit_test/Test_Blas1_abs.hpp +++ b/blas/unit_test/Test_Blas1_abs.hpp @@ -24,38 +24,13 @@ template void impl_test_abs(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; + typedef Kokkos::ArithTraits AT; typename AT::mag_type eps = AT::epsilon() * 10; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - - typename 
BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Org_Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -63,33 +38,34 @@ void impl_test_abs(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::deep_copy(org_y.h_base, y.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); // Run with nonconst input - KokkosBlas::abs(y, x); + KokkosBlas::abs(y.d_view, x.d_view); // Copy result to host (h_y is subview of h_b_y) - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(h_y(i), AT::abs(h_x(i)), eps * AT::abs(h_x(i))); + EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), + eps * AT::abs(x.h_view(i))); } // Run with const input // Reset output - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::abs(y, c_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::abs(y.d_view, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(h_y(i), AT::abs(h_x(i)), eps * AT::abs(h_x(i))); + EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), + eps * AT::abs(x.h_view(i))); } } @@ -97,26 +73,11 @@ template void impl_test_abs_mv(int 
N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; + typedef Kokkos::ArithTraits AT; - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -124,38 +85,38 @@ void impl_test_abs_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); + Kokkos::deep_copy(org_y.h_base, y.d_base); - typename ViewTypeA::const_type c_x = x; + Kokkos::deep_copy(x.h_base, x.d_base); typename AT::mag_type eps = AT::epsilon() * 10; // Test and verify non-const input - KokkosBlas::abs(y, x); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::abs(y.d_view, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - 
EXPECT_NEAR_KK(h_y(i, j), AT::abs(h_x(i, j)), eps * AT::abs(h_x(i, j))); + EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), + eps * AT::abs(x.h_view(i, j))); } } // Test and verify const input // Reset y - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::abs(y, c_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::abs(y.d_view, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(h_y(i, j), AT::abs(h_x(i, j)), eps * AT::abs(h_x(i, j))); + EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), + eps * AT::abs(x.h_view(i, j))); } } } @@ -185,8 +146,7 @@ int test_abs() { // Test::impl_test_abs(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -229,8 +189,7 @@ int test_abs_mv() { // Test::impl_test_abs_mv(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_asum.hpp b/blas/unit_test/Test_Blas1_asum.hpp index 624bfc9d09..e914c9a19a 100644 --- a/blas/unit_test/Test_Blas1_asum.hpp +++ b/blas/unit_test/Test_Blas1_asum.hpp @@ -23,35 +23,20 @@ namespace Test { template void impl_test_asum(int N) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef Kokkos::ArithTraits MAT; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - - BaseTypeA b_a("A", N); - - ViewTypeA a = Kokkos::subview(b_a, 
Kokkos::ALL(), 0); - - typename BaseTypeA::HostMirror h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = Kokkos::subview(h_b_a, Kokkos::ALL(), 0); + view_stride_adapter a("A", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; typename AT::mag_type expected_result = 0; @@ -61,13 +46,14 @@ void impl_test_asum(int N) { // parts. // // This is safe; ArithTraits::imag is 0 if T is real. - expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + expected_result += + MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); } - typename AT::mag_type nonconst_result = KokkosBlas::asum(a); + typename AT::mag_type nonconst_result = KokkosBlas::asum(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - typename AT::mag_type const_result = KokkosBlas::asum(c_a); + typename AT::mag_type const_result = KokkosBlas::asum(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } @@ -95,8 +81,7 @@ int test_asum() { // Test::impl_test_asum(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_asum(0); diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index 79a244fc6e..0d34464a84 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -27,19 +27,6 @@ void impl_test_axpby(int N) { using ScalarB = typename ViewTypeB::value_type; 
using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - using BaseTypeA = Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device>; - using BaseTypeB = Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device>; - ScalarA a = 3; ScalarB b = 5; // eps should probably be based on ScalarB since that is the type @@ -51,22 +38,9 @@ void impl_test_axpby(int N) { Kokkos::ArithTraits::abs(b)) * max_val * eps; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - auto h_b_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Org_Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -74,34 +48,28 @@ void impl_test_axpby(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); - } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - Kokkos::deep_copy(h_b_org_y, b_org_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); - Kokkos::deep_copy(h_b_x, b_x); - - // 
Run with non-const input (x) and verify - KokkosBlas::axpby(a, x, b, y); - Kokkos::deep_copy(h_b_y, b_y); + // Run with non-const input and verify + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) + b * h_b_org_y(i, 0)), - h_y(i), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), + y.h_view(i), 2 * max_error); } - Kokkos::deep_copy(b_y, b_org_y); - // Run again with const input (c_x) - KokkosBlas::axpby(a, c_x, b, y); - Kokkos::deep_copy(h_b_y, b_y); + // Reset y to the values saved in org_y + Kokkos::deep_copy(y.d_base, org_y.h_base); + // Run again with const input + KokkosBlas::axpby(a, x.d_view_const, b, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) + b * h_b_org_y(i, 0)), - h_y(i), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), + y.h_view(i), 2 * max_error); } } @@ -111,24 +79,9 @@ void impl_test_axpby_mv(int N, int K) { using ScalarB = typename ViewTypeB::value_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); ScalarA a =
3; ScalarB b = 5; @@ -145,44 +98,39 @@ void impl_test_axpby_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - ViewTypeB org_y = vfB_type::view(b_org_y); - auto h_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - - typename ViewTypeA::const_type c_x = x; + Kokkos::deep_copy(org_y.h_base, y.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); Kokkos::View r("Dot::Result", K); - KokkosBlas::axpby(a, x, b, y); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), - h_y(i, j), 2 * max_error); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), + y.h_view(i, j), 2 * max_error); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::axpby(a, c_x, b, y); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::axpby(a, x.d_view_const, b, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), - h_y(i, j), 2 * max_error); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), + y.h_view(i, j), 2 * max_error); } } } @@ -212,8 +160,7 @@ int test_axpby() { Test::impl_test_axpby(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -256,8 +203,7 @@ int test_axpby_mv() { Test::impl_test_axpby_mv(132231, 5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 890e116584..8b21ff6dc5 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -27,19 +27,6 @@ void impl_test_axpy(int N) { using ScalarB = typename ViewTypeB::value_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - using BaseTypeA = Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device>; - using BaseTypeB = Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device>; - ScalarA a = 3; const MagnitudeB max_val = 10; const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); @@ -48,20 +35,9 @@ void impl_test_axpy(int N) { max_val) * eps; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + 
view_stride_adapter org_y("Org_Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -69,35 +45,32 @@ void impl_test_axpy(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - auto h_b_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); - Kokkos::deep_copy(h_b_x, b_x); - - KokkosBlas::axpy(a, x, y); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::axpy(a, x.d_view, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - ScalarB expected = a * h_x(i) + h_b_org_y(i, 0); - EXPECT_NEAR_KK(expected, h_y(i), 2 * max_error); + ScalarB expected = a * x.h_view(i) + org_y.h_view(i); + EXPECT_NEAR_KK(expected, y.h_view(i), 2 * max_error); } // reset y to orig, and run again with const-valued x - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::axpy(a, c_x, y); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::axpy(a, x.d_view_const, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - ScalarB expected = a * h_x(i) + h_b_org_y(i, 0); - EXPECT_NEAR_KK(expected, h_y(i), 2 * max_error); + ScalarB expected = a * x.h_view(i) + org_y.h_view(i); + EXPECT_NEAR_KK(expected, y.h_view(i), 2 * max_error); } } @@ -107,24 +80,9 @@ void impl_test_axpy_mv(int N, int K) { using ScalarB = typename ViewTypeB::value_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename 
vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); ScalarA a = 3; const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); @@ -140,40 +98,36 @@ void impl_test_axpy_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - ViewTypeB org_y = vfB_type::view(b_org_y); - auto h_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); + Kokkos::deep_copy(org_y.h_base, y.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - - typename ViewTypeA::const_type c_x = x; - - KokkosBlas::axpy(a, x, y); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::axpy(a, x.d_view, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + h_org_y(i, j)), - h_y(i, j), 2 * max_error); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), + y.h_view(i, j), 2 * 
max_error); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::axpy(a, c_x, y); - Kokkos::deep_copy(h_b_y, b_y); + // reset y to orig, and run again with const-valued x + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::axpy(a, x.d_view_const, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + h_org_y(i, j)), - h_y(i, j), 2 * max_error); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), + y.h_view(i, j), 2 * max_error); } } } @@ -203,8 +157,7 @@ int test_axpy() { // Test::impl_test_axpy(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -247,8 +200,7 @@ int test_axpy_mv() { // Test::impl_test_axpy_mv(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_dot.hpp b/blas/unit_test/Test_Blas1_dot.hpp index b2dfc1bd41..d978cbafaa 100644 --- a/blas/unit_test/Test_Blas1_dot.hpp +++ b/blas/unit_test/Test_Blas1_dot.hpp @@ -27,11 +27,8 @@ void impl_test_dot(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits ats; - ViewTypeA a("a", N); - ViewTypeB b("b", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); - typename ViewTypeB::HostMirror h_b = Kokkos::create_mirror_view(b); + view_stride_adapter a("a", N); + view_stride_adapter b("b", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -39,34 +36,33 @@ void impl_test_dot(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); -
Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b, rand_pool, randStart, randEnd); + Kokkos::fill_random(b.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(h_a, a); - Kokkos::deep_copy(h_b, b); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(b.h_base, b.d_base); ScalarA expected_result = 0; - for (int i = 0; i < N; i++) expected_result += ats::conj(h_a(i)) * h_b(i); + for (int i = 0; i < N; i++) + expected_result += ats::conj(a.h_view(i)) * b.h_view(i); - ScalarA nonconst_nonconst_result = KokkosBlas::dot(a, b); + ScalarA nonconst_nonconst_result = KokkosBlas::dot(a.d_view, b.d_view); double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - typename ViewTypeA::const_type c_a = a; - typename ViewTypeB::const_type c_b = b; - ScalarA const_const_result = KokkosBlas::dot(c_a, c_b); + ScalarA const_const_result = KokkosBlas::dot(a.d_view_const, b.d_view_const); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); - ScalarA nonconst_const_result = KokkosBlas::dot(a, c_b); + ScalarA nonconst_const_result = KokkosBlas::dot(a.d_view, b.d_view_const); EXPECT_NEAR_KK(nonconst_const_result, expected_result, eps * expected_result); - ScalarA const_nonconst_result = KokkosBlas::dot(c_a, b); + ScalarA const_nonconst_result = KokkosBlas::dot(a.d_view_const, b.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -76,23 +72,8 @@ void impl_test_dot_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits ats; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_a("A", N, K); - typename vfB_type::BaseType b_b("B", N, K); - - ViewTypeA a = 
vfA_type::view(b_a); - ViewTypeB b = vfB_type::view(b_b); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - typename h_vfB_type::BaseType h_b_b = Kokkos::create_mirror_view(b_b); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); - typename ViewTypeB::HostMirror h_b = h_vfB_type::view(h_b_b); + view_stride_adapter a("A", N, K); + view_stride_adapter b("B", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -100,32 +81,29 @@ void impl_test_dot_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_b, rand_pool, randStart, randEnd); + Kokkos::fill_random(b.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_b, b_b); - - typename ViewTypeA::const_type c_a = a; - typename ViewTypeB::const_type c_b = b; + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(b.h_base, b.d_base); ScalarA* expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += ats::conj(h_a(i, j)) * h_b(i, j); + expected_result[j] += ats::conj(a.h_view(i, j)) * b.h_view(i, j); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; Kokkos::View r("Dot::Result", K); - KokkosBlas::dot(r, a, b); + KokkosBlas::dot(r, a.d_view, b.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); @@ -133,7 +111,7 @@ void impl_test_dot_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::dot(r, c_a, c_b); + KokkosBlas::dot(r, a.d_view_const, b.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_const_result = r(k); @@ -141,7 +119,7 @@ void impl_test_dot_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::dot(r, a, c_b); + KokkosBlas::dot(r, a.d_view, b.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA non_const_const_result = r(k); @@ -149,7 +127,7 @@ void impl_test_dot_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::dot(r, c_a, b); + KokkosBlas::dot(r, a.d_view_const, b.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); @@ -185,26 +163,21 @@ int test_dot() { // Test::impl_test_dot(132231); #endif - // Removing the layout stride test as ViewTypeA a("a", N); - // is invalid since the view constructor needs a stride object! 
- /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_dot(0); - Test::impl_test_dot(13); - Test::impl_test_dot(1024); - // Test::impl_test_dot(132231); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_dot(1024); - Test::impl_test_dot(1024); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_dot(0); + Test::impl_test_dot(13); + Test::impl_test_dot(1024); + // Test::impl_test_dot(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_dot(1024); + Test::impl_test_dot(1024); +#endif return 1; } @@ -235,28 +208,24 @@ int test_dot_mv() { // Test::impl_test_dot_mv(132231,5); #endif - // Removing the layout stride test as ViewTypeA a("a", N); - // is invalid since the view constructor needs a stride object! - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); Test::impl_test_dot_mv(789, 1); - // Test::impl_test_dot_mv(132231,5); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_dot_mv(1024, 5); - Test::impl_test_dot_mv(1024, 5); - #endif - */ +// Layout stride test (re-enabled): impl_test_dot_mv now builds its views +// through view_stride_adapter instead of ViewTypeA a("a", N).
+#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_dot_mv(0, 5); + Test::impl_test_dot_mv(13, 5); + Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); + // Test::impl_test_dot_mv(132231,5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(1024, 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index ced1759301..fcd896e22a 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -23,29 +23,25 @@ namespace Test { template void impl_test_iamax(int N) { typedef typename ViewTypeA::non_const_value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename AT::mag_type mag_type; using size_type = typename ViewTypeA::size_type; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("X", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); - - Kokkos::deep_copy(h_a, a); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); - mag_type expected_result = Kokkos::Details::ArithTraits::min(); + mag_type expected_result = Kokkos::ArithTraits::min(); size_type expected_max_loc = 0; for (int i = 0; i < N; i++) { - mag_type val = AT::abs(h_a(i)); + mag_type val = AT::abs(a.h_view(i)); if (val > expected_result) { expected_result = val; expected_max_loc = i + 1; @@ -60,10 +56,10 @@ void impl_test_iamax(int N) { { // printf("impl_test_iamax -- return result as a scalar on host -- N %d\n", // 
N); - size_type nonconst_max_loc = KokkosBlas::iamax(a); + size_type nonconst_max_loc = KokkosBlas::iamax(a.d_view); ASSERT_EQ(nonconst_max_loc, expected_max_loc); - size_type const_max_loc = KokkosBlas::iamax(c_a); + size_type const_max_loc = KokkosBlas::iamax(a.d_view_const); ASSERT_EQ(const_max_loc, expected_max_loc); } @@ -73,14 +69,15 @@ void impl_test_iamax(int N) { typedef Kokkos::View ViewType0D; - ViewType0D r("Iamax::Result 0-D View on host"); + ViewType0D r("Iamax::Result 0-D View on host", + typename ViewTypeA::array_layout()); - KokkosBlas::iamax(r, a); + KokkosBlas::iamax(r, a.d_view); Kokkos::fence(); size_type nonconst_max_loc = r(); ASSERT_EQ(nonconst_max_loc, expected_max_loc); - KokkosBlas::iamax(r, c_a); + KokkosBlas::iamax(r, a.d_view_const); size_type const_max_loc = r(); ASSERT_EQ(const_max_loc, expected_max_loc); } @@ -90,19 +87,20 @@ void impl_test_iamax(int N) { // %d\n", N); typedef Kokkos::View ViewType0D; - ViewType0D r("Iamax::Result 0-D View on device"); + ViewType0D r("Iamax::Result 0-D View on device", + typename ViewTypeA::array_layout()); typename ViewType0D::HostMirror h_r = Kokkos::create_mirror_view(r); size_type nonconst_max_loc, const_max_loc; - KokkosBlas::iamax(r, a); + KokkosBlas::iamax(r, a.d_view); Kokkos::deep_copy(h_r, r); nonconst_max_loc = h_r(); ASSERT_EQ(nonconst_max_loc, expected_max_loc); - KokkosBlas::iamax(r, c_a); + KokkosBlas::iamax(r, a.d_view_const); Kokkos::deep_copy(h_r, r); const_max_loc = h_r(); @@ -114,40 +112,28 @@ void impl_test_iamax(int N) { template void impl_test_iamax_mv(int N, int K) { typedef typename ViewTypeA::non_const_value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename AT::mag_type mag_type; typedef typename ViewTypeA::size_type size_type; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - 
typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); mag_type* expected_result = new mag_type[K]; size_type* expected_max_loc = new size_type[K]; for (int j = 0; j < K; j++) { - expected_result[j] = Kokkos::Details::ArithTraits::min(); + expected_result[j] = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) { - mag_type val = AT::abs(h_a(i, j)); + mag_type val = AT::abs(a.h_view(i, j)); if (val > expected_result[j]) { expected_result[j] = val; expected_max_loc[j] = i + 1; @@ -162,11 +148,13 @@ void impl_test_iamax_mv(int N, int K) { { // printf("impl_test_iamax_mv -- return results as a 1-D View on host -- N // %d\n", N); + Kokkos::View rcontig( + "Iamax::Result View on host", K); Kokkos::View - r("Iamax::Result View on host", K); + r = rcontig; - KokkosBlas::iamax(r, a); + KokkosBlas::iamax(r, a.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { @@ -175,7 +163,7 @@ void impl_test_iamax_mv(int N, int K) { ASSERT_EQ(nonconst_result, exp_result); } - KokkosBlas::iamax(r, c_a); + KokkosBlas::iamax(r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { @@ -188,13 +176,14 @@ void impl_test_iamax_mv(int N, int K) { { // printf("impl_test_iamax_mv -- return results as a 1-D View on device -- N // %d\n", N); - Kokkos::View r( - "Iamax::Result View on device", K); + Kokkos::View rcontig("Iamax::Result View on host", K); + Kokkos::View r = + rcontig; typename Kokkos::View::HostMirror h_r = - Kokkos::create_mirror_view(r); + Kokkos::create_mirror_view(rcontig); - 
KokkosBlas::iamax(r, a); + KokkosBlas::iamax(r, a.d_view); Kokkos::deep_copy(h_r, r); for (int k = 0; k < K; k++) { @@ -203,7 +192,7 @@ void impl_test_iamax_mv(int N, int K) { ASSERT_EQ(nonconst_result, exp_result); } - KokkosBlas::iamax(r, c_a); + KokkosBlas::iamax(r, a.d_view_const); Kokkos::deep_copy(h_r, r); for (int k = 0; k < K; k++) { @@ -240,17 +229,14 @@ int test_iamax() { // Test::impl_test_iamax(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_iamax(0); - Test::impl_test_iamax(13); - Test::impl_test_iamax(1024); - // Test::impl_test_iamax(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_iamax(0); + Test::impl_test_iamax(13); + Test::impl_test_iamax(1024); + // Test::impl_test_iamax(132231); +#endif return 1; } @@ -277,17 +263,14 @@ int test_iamax_mv() { // Test::impl_test_iamax_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_iamax_mv(0, 5); - Test::impl_test_iamax_mv(13, 5); - Test::impl_test_iamax_mv(1024, 5); - // Test::impl_test_iamax_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_iamax_mv(0, 5); + Test::impl_test_iamax_mv(13, 5); + Test::impl_test_iamax_mv(1024, 5); + // Test::impl_test_iamax_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_mult.hpp b/blas/unit_test/Test_Blas1_mult.hpp index 3c027f26e7..0888c7a6b2 100644 --- a/blas/unit_test/Test_Blas1_mult.hpp +++ b/blas/unit_test/Test_Blas1_mult.hpp @@ -31,17 +31,10 @@ void 
impl_test_mult(int N) { ScalarB b = 5; double eps = std::is_same::value ? 1e-4 : 1e-7; - ViewTypeA x("X", N); - ViewTypeB y("Y", N); - ViewTypeC z("Y", N); - ViewTypeC b_org_z("Org_Z", N); - - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename ViewTypeA::HostMirror h_x = Kokkos::create_mirror_view(x); - typename ViewTypeB::HostMirror h_y = Kokkos::create_mirror_view(y); - typename ViewTypeC::HostMirror h_z = Kokkos::create_mirror_view(z); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter z("Z", N); + view_stride_adapter org_z("Org_Z", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -49,49 +42,48 @@ void impl_test_mult(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarC randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(z, rand_pool, randStart, randEnd); + Kokkos::fill_random(z.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_z, z); - auto h_b_org_z = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); - - Kokkos::deep_copy(h_x, x); - Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(org_z.h_base, z.d_base); - // expected_result = ScalarC(b*h_z(i) + a*h_x(i)*h_y(i)) + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - KokkosBlas::mult(b, z, a, x, y); - Kokkos::deep_copy(h_z, z); + KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), - h_z(i), eps); + 
EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + + b * org_z.h_view(i)), + z.h_view(i), eps); } - Kokkos::deep_copy(z, b_org_z); - KokkosBlas::mult(b, z, a, x, c_y); - Kokkos::deep_copy(h_z, z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view_const); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), - h_z(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + + b * org_z.h_view(i)), + z.h_view(i), eps); } - Kokkos::deep_copy(z, b_org_z); - KokkosBlas::mult(b, z, a, c_x, c_y); - Kokkos::deep_copy(h_z, z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::mult(b, z.d_view, a, x.d_view_const, y.d_view_const); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), - h_z(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + + b * org_z.h_view(i)), + z.h_view(i), eps); } } @@ -101,26 +93,11 @@ void impl_test_mult_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef multivector_layout_adapter vfB_type; - typedef multivector_layout_adapter vfC_type; - - ViewTypeA x("X", N); - typename vfB_type::BaseType b_y("Y", N, K); - typename vfC_type::BaseType b_z("Z", N, K); - typename vfC_type::BaseType b_org_z("Z", N, K); - - ViewTypeB y = vfB_type::view(b_y); - ViewTypeC z = vfC_type::view(b_z); - - typedef multivector_layout_adapter h_vfB_type; - typedef multivector_layout_adapter h_vfC_type; - - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - typename h_vfC_type::BaseType h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::create_mirror_view(x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - typename ViewTypeC::HostMirror h_z = h_vfC_type::view(h_b_z); 
+ // x is rank-1, all others are rank-2 + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N, K); + view_stride_adapter z("Z", N, K); + view_stride_adapter org_z("Org_Z", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -128,52 +105,46 @@ void impl_test_mult_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarC randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_z, rand_pool, randStart, randEnd); + Kokkos::fill_random(z.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_z, b_z); - auto h_b_org_z = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); - - Kokkos::deep_copy(h_x, x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - ScalarA a = 3; - ScalarB b = 5; - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; + ScalarA a = 3; + ScalarB b = 5; double eps = std::is_same::value ? 
1e-4 : 1e-7; - KokkosBlas::mult(b, z, a, x, y); - Kokkos::deep_copy(h_b_z, b_z); + KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) * h_y(i, j) + b * h_b_org_z(i, j)), - h_z(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + + b * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } - Kokkos::deep_copy(b_z, b_org_z); - KokkosBlas::mult(b, z, a, x, c_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view_const); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) * h_y(i, j) + b * h_b_org_z(i, j)), - h_z(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + + b * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } } @@ -213,27 +184,28 @@ int test_mult() { // Device>(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_mult( 0); Test::impl_test_mult( 13); Test::impl_test_mult( 1024); - // Test::impl_test_mult(132231); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult( 1024); Test::impl_test_mult( 1024); #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_mult( + 0); + Test::impl_test_mult( + 13); + Test::impl_test_mult( + 1024); + // Test::impl_test_mult(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && 
\ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult( + 1024); + Test::impl_test_mult( + 1024); +#endif return 1; } @@ -272,30 +244,28 @@ int test_mult_mv() { // Device>(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View - view_type_b_ls; typedef Kokkos::View - view_type_c_ls; Test::impl_test_mult_mv(0, 5); Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); - // Test::impl_test_mult_mv(132231,5); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult_mv(1024, 5); - Test::impl_test_mult_mv(1024, 5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_mult_mv(0, 5); + Test::impl_test_mult_mv(13, 5); + Test::impl_test_mult_mv(1024, 5); + // Test::impl_test_mult_mv(132231,5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(1024, 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index b64aab9c3c..5c99895a49 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -27,20 +27,17 @@ void impl_test_nrm1(int N) { typedef typename AT::mag_type mag_type; typedef Kokkos::ArithTraits MAT; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("a", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + 
Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = (std::is_same::mag_type, float>::value ? 1e-4 @@ -53,45 +50,34 @@ void impl_test_nrm1(int N) { // parts. See netlib, MKL, and CUBLAS documentation. // // This is safe; ArithTraits::imag is 0 if T is real. - expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + expected_result += + MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); } - mag_type nonconst_result = KokkosBlas::nrm1(a); + mag_type nonconst_result = KokkosBlas::nrm1(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - mag_type const_result = KokkosBlas::nrm1(c_a); + mag_type const_result = KokkosBlas::nrm1(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } template void impl_test_nrm1_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename AT::mag_type mag_type; typedef Kokkos::ArithTraits MAT; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); double eps = (std::is_same::mag_type, float>::value @@ -103,20 +89,19 @@ void impl_test_nrm1_mv(int 
N, int K) { for (int k = 0; k < K; k++) { expected_result(k) = MAT::zero(); for (int i = 0; i < N; i++) { - expected_result(k) += - MAT::abs(AT::real(h_a(i, k))) + MAT::abs(AT::imag(h_a(i, k))); + expected_result(k) += MAT::abs(AT::real(a.h_view(i, k))) + + MAT::abs(AT::imag(a.h_view(i, k))); } } Kokkos::View r("Nrm1::Result", K); Kokkos::View c_r("Nrm1::ConstResult", K); - KokkosBlas::nrm1(r, a); - KokkosBlas::nrm1(c_r, a); + KokkosBlas::nrm1(r, a.d_view); + KokkosBlas::nrm1(c_r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { EXPECT_NEAR_KK(r(k), expected_result(k), eps * expected_result(k)); - EXPECT_NEAR_KK(c_r(k), expected_result(k), eps * expected_result(k)); } } } // namespace Test @@ -143,17 +128,14 @@ int test_nrm1() { Test::impl_test_nrm1(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm1(0); - Test::impl_test_nrm1(13); - Test::impl_test_nrm1(1024); - Test::impl_test_nrm1(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm1(0); + Test::impl_test_nrm1(13); + Test::impl_test_nrm1(1024); + Test::impl_test_nrm1(132231); +#endif return 1; } @@ -182,8 +164,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(132231, 5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm1_mv(0, 5); diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index d17c9af505..1264cfecf2 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -23,82 +23,66 @@ namespace Test { template void impl_test_nrm2(int N) { typedef 
typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("a", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; typename AT::mag_type expected_result = 0; for (int i = 0; i < N; i++) { - expected_result += AT::abs(h_a(i)) * AT::abs(h_a(i)); + expected_result += AT::abs(a.h_view(i)) * AT::abs(a.h_view(i)); } - expected_result = Kokkos::Details::ArithTraits::sqrt( - expected_result); + expected_result = + Kokkos::ArithTraits::sqrt(expected_result); - typename AT::mag_type nonconst_result = KokkosBlas::nrm2(a); + typename AT::mag_type nonconst_result = KokkosBlas::nrm2(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - typename AT::mag_type const_result = KokkosBlas::nrm2(c_a); + typename AT::mag_type const_result = KokkosBlas::nrm2(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } template void impl_test_nrm2_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); + typedef Kokkos::ArithTraits AT; - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); 
ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); - - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - expected_result[j] += AT::abs(h_a(i, j)) * AT::abs(h_a(i, j)); + expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); } expected_result[j] = - Kokkos::Details::ArithTraits::sqrt( - expected_result[j]); + Kokkos::ArithTraits::sqrt(expected_result[j]); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; Kokkos::View r("Dot::Result", K); - KokkosBlas::nrm2(r, a); + KokkosBlas::nrm2(r, a.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); @@ -106,7 +90,7 @@ void impl_test_nrm2_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::nrm2(r, c_a); + KokkosBlas::nrm2(r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); @@ -139,17 +123,14 @@ int test_nrm2() { // Test::impl_test_nrm2(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2(0); - Test::impl_test_nrm2(13); - Test::impl_test_nrm2(1024); - // Test::impl_test_nrm2(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2(0); + Test::impl_test_nrm2(13); + Test::impl_test_nrm2(1024); + // Test::impl_test_nrm2(132231); +#endif return 1; } @@ -178,18 +159,15 @@ int test_nrm2_mv() { // 
Test::impl_test_nrm2_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_nrm2_mv(0, 5); - Test::impl_test_nrm2_mv(13, 5); - Test::impl_test_nrm2_mv(1024, 5); - Test::impl_test_nrm2_mv(789, 1); - // Test::impl_test_nrm2_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2_mv(0, 5); + Test::impl_test_nrm2_mv(13, 5); + Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); + // Test::impl_test_nrm2_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2_squared.hpp b/blas/unit_test/Test_Blas1_nrm2_squared.hpp index ebebd57b9a..c218a12d39 100644 --- a/blas/unit_test/Test_Blas1_nrm2_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2_squared.hpp @@ -23,81 +23,54 @@ namespace Test { template void impl_test_nrm2_squared(int N) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - - BaseTypeA b_a("A", N); - - ViewTypeA a = Kokkos::subview(b_a, Kokkos::ALL(), 0); - - typename BaseTypeA::HostMirror h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = Kokkos::subview(h_b_a, Kokkos::ALL(), 0); + view_stride_adapter a("a", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = 
std::is_same::value ? 2 * 1e-5 : 1e-7; typename AT::mag_type expected_result(0); for (int i = 0; i < N; i++) { - expected_result += AT::abs(h_a(i)) * AT::abs(h_a(i)); + expected_result += AT::abs(a.h_view(i)) * AT::abs(a.h_view(i)); } - typename AT::mag_type nonconst_result = KokkosBlas::nrm2_squared(a); + typename AT::mag_type nonconst_result = KokkosBlas::nrm2_squared(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - typename AT::mag_type const_result = KokkosBlas::nrm2_squared(c_a); + typename AT::mag_type const_result = KokkosBlas::nrm2_squared(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } template void impl_test_nrm2_squared_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; + typedef Kokkos::ArithTraits AT; - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - expected_result[j] += AT::abs(h_a(i, j)) * AT::abs(h_a(i, j)); + expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); } } @@ -107,7 +80,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { Kokkos::View 
r("Dot::Result", K); - KokkosBlas::nrm2_squared(r, a); + KokkosBlas::nrm2_squared(r, a.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); @@ -118,7 +91,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { EXPECT_NEAR_KK(diff, zero, eps); } - KokkosBlas::nrm2_squared(r, c_a); + KokkosBlas::nrm2_squared(r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); @@ -155,8 +128,7 @@ int test_nrm2_squared() { // Test::impl_test_nrm2_squared(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_squared(0); @@ -192,8 +164,7 @@ int test_nrm2_squared_mv() { // Test::impl_test_nrm2_squared_mv(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_squared_mv(0, 5); diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index b91c5fbf78..89c1bdad45 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -22,93 +22,87 @@ namespace Test { template void impl_test_nrm2w(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; - ViewTypeA a("A", N); - ViewTypeA w("W", N); + view_stride_adapter a("A", N); + view_stride_adapter w("W", N); - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); - typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + constexpr MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + 
const MagnitudeA max_error = + max_val * std::sqrt(static_cast(N)) * eps; Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); - Kokkos::fill_random(w, rand_pool, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(w.d_view, rand_pool, AT::one(), + randEnd); // Avoid divide by 0 - Kokkos::deep_copy(h_a, a); - Kokkos::deep_copy(h_w, w); - - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(w.h_base, w.d_base); typename AT::mag_type expected_result = 0; for (int i = 0; i < N; i++) { - typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + typename AT::mag_type term = AT::abs(a.h_view(i)) / AT::abs(w.h_view(i)); expected_result += term * term; } expected_result = Kokkos::ArithTraits::sqrt(expected_result); - typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a, w); - EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a.d_view, w.d_view); + EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); } template void impl_test_nrm2w_mv(int N, int K) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - typename vfA_type::BaseType b_w("W", N, K); - - ViewTypeA a = vfA_type::view(b_a); - ViewTypeA w = vfA_type::view(b_w); - - typedef multivector_layout_adapter h_vfA_type; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); + 
view_stride_adapter a("A", N, K); + view_stride_adapter w("W", N, K); - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); - typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + constexpr MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = + max_val * std::sqrt(static_cast(N)) * eps; Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(w.d_view, rand_pool, AT::one(), + randEnd); // Avoid dividing by 0 - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_w, b_w); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(w.h_base, w.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + typename AT::mag_type term = + AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); expected_result[j] += term * term; } expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); } - double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - Kokkos::View r("Dot::Result", K); - KokkosBlas::nrm2w(r, a, w); + KokkosBlas::nrm2w(r, a.d_view, w.d_view); auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r_host(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], max_error); } delete[] expected_result; @@ -137,17 +131,14 @@ int test_nrm2w() { // Test::impl_test_nrm2(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2w(0); - Test::impl_test_nrm2w(13); - Test::impl_test_nrm2w(1024); - // Test::impl_test_nrm2(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif return 1; } @@ -176,18 +167,15 @@ int test_nrm2w_mv() { // Test::impl_test_nrm2w_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_nrm2w_mv(0, 5); - Test::impl_test_nrm2w_mv(13, 5); - Test::impl_test_nrm2w_mv(1024, 5); - Test::impl_test_nrm2w_mv(789, 1); - // Test::impl_test_nrm2w_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp 
b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index 59661cc7e5..bacc733b1a 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -22,89 +22,81 @@ namespace Test { template void impl_test_nrm2w_squared(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; - ViewTypeA a("A", N); - ViewTypeA w("W", N); + view_stride_adapter a("A", N); + view_stride_adapter w("W", N); - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); - typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + constexpr MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = max_val * max_val * N * eps; Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); - Kokkos::fill_random(w, rand_pool, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(w.d_view, rand_pool, AT::one(), + randEnd); // Avoid divide by 0 - Kokkos::deep_copy(h_a, a); - Kokkos::deep_copy(h_w, w); - - double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(w.h_base, w.d_base); typename AT::mag_type expected_result = 0; for (int i = 0; i < N; i++) { - typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + typename AT::mag_type term = AT::abs(a.h_view(i)) / AT::abs(w.h_view(i)); expected_result += term * term; } - typename AT::mag_type nonconst_result = KokkosBlas::nrm2w_squared(a, w); - EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); + typename AT::mag_type nonconst_result = + KokkosBlas::nrm2w_squared(a.d_view, w.d_view); + EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); } template void impl_test_nrm2w_squared_mv(int N, int K) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - typename vfA_type::BaseType b_w("W", N, K); - - ViewTypeA a = vfA_type::view(b_a); - ViewTypeA w = vfA_type::view(b_w); - - typedef multivector_layout_adapter h_vfA_type; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); + view_stride_adapter a("A", N, K); + view_stride_adapter w("W", N, K); - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); - typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + constexpr MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = max_val * max_val * N * eps; Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); + 
Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(w.d_view, rand_pool, AT::one(), randEnd); - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_w, b_w); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(w.h_base, w.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + typename AT::mag_type term = + AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); expected_result[j] += term * term; } } - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - Kokkos::View r("Dot::Result", K); - KokkosBlas::nrm2w_squared(r, a, w); + KokkosBlas::nrm2w_squared(r, a.d_view, w.d_view); auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r_host(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], max_error); } delete[] expected_result; @@ -133,17 +125,14 @@ int test_nrm2w_squared() { // Test::impl_test_nrm2(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2w_squared(0); - Test::impl_test_nrm2w_squared(13); - Test::impl_test_nrm2w_squared(1024); - // Test::impl_test_nrm2(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif return 1; } @@ -172,18 +161,15 @@ int test_nrm2w_squared_mv() { // Test::impl_test_nrm2w_squared_mv(132231,5); #endif - 
/* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_nrm2w_squared_mv(0, - 5); Test::impl_test_nrm2w_squared_mv(13, 5); - Test::impl_test_nrm2w_squared_mv(1024, 5); - Test::impl_test_nrm2w_squared_mv(789, 1); - // Test::impl_test_nrm2w_squared_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index 8da5550afa..438db16895 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -23,73 +23,58 @@ namespace Test { template void impl_test_nrminf(int N) { typedef typename ViewTypeA::non_const_value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("A", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; typename AT::mag_type expected_result = - Kokkos::Details::ArithTraits::min(); + Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) - if (AT::abs(h_a(i)) > expected_result) expected_result = AT::abs(h_a(i)); + if (AT::abs(a.h_view(i)) > expected_result) + expected_result = AT::abs(a.h_view(i)); if (N == 0) expected_result = typename AT::mag_type(0); - typename AT::mag_type nonconst_result = KokkosBlas::nrminf(a); + typename AT::mag_type nonconst_result = KokkosBlas::nrminf(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - typename AT::mag_type const_result = KokkosBlas::nrminf(c_a); + typename AT::mag_type const_result = KokkosBlas::nrminf(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } template void impl_test_nrminf_mv(int N, int K) { typedef typename ViewTypeA::non_const_value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); + typedef Kokkos::ArithTraits AT; - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); - - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { - expected_result[j] = - Kokkos::Details::ArithTraits::min(); + expected_result[j] = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) { - if (AT::abs(h_a(i, j)) > 
expected_result[j]) - expected_result[j] = AT::abs(h_a(i, j)); + if (AT::abs(a.h_view(i, j)) > expected_result[j]) + expected_result[j] = AT::abs(a.h_view(i, j)); } if (N == 0) expected_result[j] = typename AT::mag_type(0); } @@ -98,14 +83,14 @@ void impl_test_nrminf_mv(int N, int K) { Kokkos::View r("Dot::Result", K); - KokkosBlas::nrminf(r, a); + KokkosBlas::nrminf(r, a.d_view); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); typename AT::mag_type exp_result = expected_result[k]; EXPECT_NEAR_KK(nonconst_result, exp_result, eps * exp_result); } - KokkosBlas::nrminf(r, c_a); + KokkosBlas::nrminf(r, a.d_view_const); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); typename AT::mag_type exp_result = expected_result[k]; @@ -137,17 +122,14 @@ int test_nrminf() { // Test::impl_test_nrminf(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrminf(0); - Test::impl_test_nrminf(13); - Test::impl_test_nrminf(1024); - // Test::impl_test_nrminf(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrminf(0); + Test::impl_test_nrminf(13); + Test::impl_test_nrminf(1024); + // Test::impl_test_nrminf(132231); +#endif return 1; } @@ -174,17 +156,14 @@ int test_nrminf_mv() { // Test::impl_test_nrminf_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_nrminf_mv(0, 5); - Test::impl_test_nrminf_mv(13, 5); - Test::impl_test_nrminf_mv(1024, 5); - // Test::impl_test_nrminf_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrminf_mv(0, 5); + Test::impl_test_nrminf_mv(13, 5); + Test::impl_test_nrminf_mv(1024, 5); + // Test::impl_test_nrminf_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index 257429ac0d..841725e6fd 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -23,168 +23,91 @@ namespace Test { template void impl_test_reciprocal(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - - typename AT::mag_type eps = AT::epsilon() * 2000; - typename AT::mag_type zero = AT::abs(AT::zero()); - typename AT::mag_type one = AT::abs(AT::one()); - - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; + using MagnitudeB = typename Kokkos::ArithTraits::mag_type; + + const MagnitudeB eps = 
Kokkos::ArithTraits::epsilon(); + const MagnitudeA one = AT::abs(AT::one()); + const MagnitudeA max_val = 10; + + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); { ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, one, randEnd); } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); - } - - Kokkos::deep_copy(b_org_y, b_y); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); - ScalarA expected_result(0); - for (int i = 0; i < N; i++) { - expected_result += - AT::abs(AT::one() / h_x(i)) * AT::abs(AT::one() / h_x(i)); + KokkosBlas::reciprocal(y.d_view, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); + for (int i = 0; i < N; ++i) { + EXPECT_NEAR_KK(y.h_view(i), ScalarB(one / x.h_view(i)), 2 * eps); } - KokkosBlas::reciprocal(y, x); - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); - typename AT::mag_type divisor = - AT::abs(expected_result) == zero ? 
one : AT::abs(expected_result); - typename AT::mag_type diff = - AT::abs(nonconst_nonconst_result - expected_result) / divisor; - EXPECT_NEAR_KK(diff, zero, eps); - - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::reciprocal(y, c_x); - ScalarB const_nonconst_result = KokkosBlas::dot(y, y); - diff = AT::abs(const_nonconst_result - expected_result) / divisor; - EXPECT_NEAR_KK(diff, zero, eps); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + + KokkosBlas::reciprocal(y.d_view, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); + for (int i = 0; i < N; ++i) { + EXPECT_NEAR_KK(y.h_view(i), ScalarB(one / x.h_view(i)), 2 * eps); + } } template void impl_test_reciprocal_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); { ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); - } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, 
randEnd); + Test::getRandomBounds(10, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, + Kokkos::ArithTraits::one(), randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); - typename ViewTypeA::const_type c_x = x; + KokkosBlas::reciprocal(y.d_view, x.d_view); - ScalarA* expected_result = new ScalarA[K]; - for (int j = 0; j < K; j++) { - expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) { - expected_result[j] += - AT::abs(AT::one() / h_x(i, j)) * AT::abs(AT::one() / h_x(i, j)); + Kokkos::deep_copy(y.h_base, y.d_base); + for (int j = 0; j < K; ++j) { + for (int i = 0; i < N; ++i) { + EXPECT_NEAR_KK( + y.h_view(i, j), + Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); } } - typename AT::mag_type eps = AT::epsilon() * 2000; - typename AT::mag_type zero = AT::abs(AT::zero()); - typename AT::mag_type one = AT::abs(AT::one()); - - Kokkos::View r("Dot::Result", K); - - KokkosBlas::reciprocal(y, x); - KokkosBlas::dot(r, y, y); - for (int k = 0; k < K; k++) { - ScalarA nonconst_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_result - expected_result[k]) / divisor; - EXPECT_NEAR_KK(diff, zero, eps); - } + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::reciprocal(y, c_x); - KokkosBlas::dot(r, y, y); - for (int k = 0; k < K; k++) { - ScalarA const_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_result - expected_result[k]) / divisor; - EXPECT_NEAR_KK(diff, zero, eps); + KokkosBlas::reciprocal(y.d_view, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); + for (int j = 0; j < K; j++) { + for (int i = 0; i < N; ++i) { + EXPECT_NEAR_KK( + y.h_view(i, j), + Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); + } } - - delete[] expected_result; } } // namespace Test @@ -212,24 +135,21 @@ int test_reciprocal() { // Test::impl_test_reciprocal(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_reciprocal(0); - Test::impl_test_reciprocal(13); - Test::impl_test_reciprocal(1024); - // Test::impl_test_reciprocal(132231); #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_reciprocal(1024); - Test::impl_test_reciprocal(1024); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_reciprocal(0); + Test::impl_test_reciprocal(13); + Test::impl_test_reciprocal(1024); + // Test::impl_test_reciprocal(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_reciprocal(1024); + Test::impl_test_reciprocal(1024); +#endif return 1; } @@ -262,28 +182,25 @@ int test_reciprocal_mv() { // Device>(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; Test::impl_test_reciprocal_mv(0, 5); 
Test::impl_test_reciprocal_mv(13, 5); Test::impl_test_reciprocal_mv(1024, 5); - // Test::impl_test_reciprocal_mv(132231,5); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_reciprocal_mv(1024, - 5); - Test::impl_test_reciprocal_mv(1024, - 5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_reciprocal_mv(0, 5); + Test::impl_test_reciprocal_mv(13, 5); + Test::impl_test_reciprocal_mv(1024, + 5); + // Test::impl_test_reciprocal_mv(132231,5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_reciprocal_mv(1024, + 5); + Test::impl_test_reciprocal_mv(1024, + 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index 1c572073a5..6c4f7b7f2a 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -25,20 +25,13 @@ template void impl_test_scal(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; ScalarA a(3); typename AT::mag_type eps = AT::epsilon() * 1000; - ViewTypeA x("X", N); - ViewTypeB y("Y", N); - ViewTypeB org_y("Org_Y", N); - - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename ViewTypeA::HostMirror h_x = Kokkos::create_mirror_view(x); - typename ViewTypeB::HostMirror h_y = Kokkos::create_mirror_view(y); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -46,29 +39,23 @@ void impl_test_scal(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); - } - { - ScalarB 
randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(y, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(org_y, y); - - Kokkos::deep_copy(h_x, x); + Kokkos::deep_copy(x.h_base, x.d_base); - KokkosBlas::scal(y, a, x); - Kokkos::deep_copy(h_y, y); + KokkosBlas::scal(y.d_view, a, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i)), h_y(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i)), y.h_view(i), eps); } - Kokkos::deep_copy(y, org_y); - KokkosBlas::scal(y, a, c_x); - Kokkos::deep_copy(h_y, y); + // Zero out y again and run with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, a, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i)), h_y(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i)), y.h_view(i), eps); } } @@ -76,26 +63,10 @@ template void impl_test_scal_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); + typedef Kokkos::ArithTraits AT; - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", 
N, K); + view_stride_adapter y("Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -103,41 +74,34 @@ void impl_test_scal_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); - } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } - Kokkos::fence(); - - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA a(3.0); - typename ViewTypeA::const_type c_x = x; typename AT::mag_type eps = AT::epsilon() * 1000; Kokkos::View r("Dot::Result", K); - KokkosBlas::scal(y, a, x); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::scal(y.d_view, a, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j)), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), + eps); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::scal(y, a, c_x); - Kokkos::deep_copy(h_b_y, b_y); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, a, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j)), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), + eps); } } @@ -152,22 +116,23 @@ void impl_test_scal_mv(int N, int K) { auto h_params = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), params); - KokkosBlas::scal(y, params, x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, params, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 
0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(h_params(j) * h_x(i, j)), h_y(i, j), - eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), + y.h_view(i, j), eps); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::scal(y, params, c_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, params, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(h_params(j) * h_x(i, j)), h_y(i, j), - eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), + y.h_view(i, j), eps); } } } @@ -197,24 +162,21 @@ int test_scal() { // Test::impl_test_scal(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_scal(0); - Test::impl_test_scal(13); - Test::impl_test_scal(1024); - // Test::impl_test_scal(132231); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_scal(1024); - Test::impl_test_scal(1024); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_scal(0); + Test::impl_test_scal(13); + Test::impl_test_scal(1024); + // Test::impl_test_scal(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_scal(1024); + Test::impl_test_scal(1024); +#endif return 1; } @@ -243,25 +205,21 @@ int test_scal_mv() { // Test::impl_test_scal_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - 
typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; Test::impl_test_scal_mv(0, 5); Test::impl_test_scal_mv(13, 5); Test::impl_test_scal_mv(1024, 5); - // Test::impl_test_scal_mv(132231,5); #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_scal_mv(1024, 5); - Test::impl_test_scal_mv(1024, 5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_scal_mv(0, 5); + Test::impl_test_scal_mv(13, 5); + Test::impl_test_scal_mv(1024, 5); + // Test::impl_test_scal_mv(132231,5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_scal_mv(1024, 5); + Test::impl_test_scal_mv(1024, 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_serial_setscal.hpp b/blas/unit_test/Test_Blas1_serial_setscal.hpp index bb33aa451a..80a0561d60 100644 --- a/blas/unit_test/Test_Blas1_serial_setscal.hpp +++ b/blas/unit_test/Test_Blas1_serial_setscal.hpp @@ -99,7 +99,7 @@ template ats; + typedef Kokkos::ArithTraits ats; /// radomized input testing views const ScalarType alpha = 11.1; diff --git a/blas/unit_test/Test_Blas1_sum.hpp b/blas/unit_test/Test_Blas1_sum.hpp index 4472f8d204..cf119cbd00 100644 --- a/blas/unit_test/Test_Blas1_sum.hpp +++ b/blas/unit_test/Test_Blas1_sum.hpp @@ -24,29 +24,26 @@ template void impl_test_sum(int N) { typedef typename ViewTypeA::value_type ScalarA; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("A", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_a, a); + 
Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; ScalarA expected_result = 0; - for (int i = 0; i < N; i++) expected_result += h_a(i); + for (int i = 0; i < N; i++) expected_result += a.h_view(i); - ScalarA nonconst_result = KokkosBlas::sum(a); + ScalarA nonconst_result = KokkosBlas::sum(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - ScalarA const_result = KokkosBlas::sum(c_a); + ScalarA const_result = KokkosBlas::sum(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } @@ -54,40 +51,28 @@ template void impl_test_sum_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); - - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); ScalarA* expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) expected_result[j] += h_a(i, j); + for (int i = 0; i < N; i++) expected_result[j] += a.h_view(i, j); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; Kokkos::View r("Sum::Result", K); - KokkosBlas::sum(r, a); + KokkosBlas::sum(r, a.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_result = r(k); @@ -95,7 +80,7 @@ void impl_test_sum_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::sum(r, c_a); + KokkosBlas::sum(r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_result = r(k); @@ -128,17 +113,14 @@ int test_sum() { // Test::impl_test_sum(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_sum(0); - Test::impl_test_sum(13); - Test::impl_test_sum(1024); - // Test::impl_test_sum(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_sum(0); + Test::impl_test_sum(13); + Test::impl_test_sum(1024); + // Test::impl_test_sum(132231); +#endif return 1; } @@ -167,18 +149,15 @@ int test_sum_mv() { // Test::impl_test_sum_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_sum_mv(0, 5); - Test::impl_test_sum_mv(13, 5); - Test::impl_test_sum_mv(1024, 5); - Test::impl_test_sum_mv(789, 1); - // Test::impl_test_sum_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_sum_mv(0, 5); + Test::impl_test_sum_mv(13, 5); + Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); + // Test::impl_test_sum_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_team_abs.hpp b/blas/unit_test/Test_Blas1_team_abs.hpp index 318f04c58e..d3f4f661d0 100644 --- 
a/blas/unit_test/Test_Blas1_team_abs.hpp +++ b/blas/unit_test/Test_Blas1_team_abs.hpp @@ -39,53 +39,24 @@ void impl_test_team_abs(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; + typedef Kokkos::ArithTraits AT; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(1)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(1)); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(1)); - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += AT::abs(h_x(i)) * AT::abs(h_x(i)); + expected_result += AT::abs(x.h_view(i)) * AT::abs(x.h_view(i)); // KokkosBlas::abs(y,x); Kokkos::parallel_for( @@ -95,20 +66,23 @@ void impl_test_team_abs(int N) { 
KokkosBlas::Experimental::abs( teamMember, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); + ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_y, b_org_y); + // Zero out y and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); // KokkosBlas::abs(y,c_x); Kokkos::parallel_for( @@ -118,16 +92,18 @@ void impl_test_team_abs(int N) { KokkosBlas::Experimental::abs( teamMember, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = KokkosBlas::dot(y, y); + ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -141,45 +117,24 @@ void impl_test_team_abs_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); + typedef Kokkos::ArithTraits AT; - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(1)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(1)); - - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(1)); - typename ViewTypeA::const_type c_x = x; + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += AT::abs(h_x(i, j)) * AT::abs(h_x(i, j)); + expected_result[j] += AT::abs(x.h_view(i, j)) * AT::abs(x.h_view(i, j)); } // double eps = 
std::is_same::value?2*1e-5:1e-7; @@ -195,11 +150,11 @@ void impl_test_team_abs_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), - Kokkos::subview(x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_result = r(k); typename AT::mag_type divisor = @@ -211,7 +166,8 @@ void impl_test_team_abs_mv(int N, int K) { // eps*expected_result[k]); } - Kokkos::deep_copy(b_y, b_org_y); + // Zero out y and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); // KokkosBlas::abs(y,c_x); Kokkos::parallel_for( @@ -219,11 +175,11 @@ void impl_test_team_abs_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), - Kokkos::subview(c_x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_result = r(k); typename AT::mag_type divisor = @@ -263,8 +219,7 @@ int test_team_abs() { // Test::impl_test_team_abs(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -309,8 +264,7 @@ int test_team_abs_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_axpby.hpp b/blas/unit_test/Test_Blas1_team_axpby.hpp index e776085a66..e11b1e14a5 100644 --- a/blas/unit_test/Test_Blas1_team_axpby.hpp +++ b/blas/unit_test/Test_Blas1_team_axpby.hpp @@ -40,55 +40,28 @@ void impl_test_team_axpby(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - ScalarA a = 3; ScalarB b = 5; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + 
Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += - ScalarB(a * h_x(i) + b * h_y(i)) * ScalarB(a * h_x(i) + b * h_y(i)); + expected_result += ScalarB(a * x.h_view(i) + b * y.h_view(i)) * + ScalarB(a * x.h_view(i) + b * y.h_view(i)); // KokkosBlas::axpby(a,x,b,y); Kokkos::parallel_for( @@ -98,21 +71,23 @@ void impl_test_team_axpby(int N) { KokkosBlas::Experimental::axpby( teamMember, a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); + ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpby(a,c_x,b,y); Kokkos::parallel_for( @@ -122,17 +97,20 @@ void impl_test_team_axpby(int N) { KokkosBlas::Experimental::axpby( teamMember, a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = KokkosBlas::dot(c_y, c_y); + ScalarB const_nonconst_result = + KokkosBlas::dot(y.d_view_const, y.d_view_const); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -147,53 +125,36 @@ void impl_test_team_axpby_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); - ScalarA a = 3; - ScalarB b = 5; - typename ViewTypeA::const_type c_x = x; + ScalarA a = 3; + ScalarB b = 5; ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - 
expected_result[j] += ScalarB(a * h_x(i, j) + b * h_y(i, j)) * - ScalarB(a * h_x(i, j) + b * h_y(i, j)); + expected_result[j] += ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)) * + ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; Kokkos::View r("Dot::Result", K); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; // KokkosBlas::axpby(a,x,b,y); Kokkos::parallel_for( @@ -201,11 +162,11 @@ void impl_test_team_axpby_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby( - teamMember, a, Kokkos::subview(x, Kokkos::ALL(), teamId), b, - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); EXPECT_NEAR_KK(AT::abs(nonconst_nonconst_result), @@ -213,7 +174,7 @@ void impl_test_team_axpby_mv(int N, int K) { AT::abs(expected_result[k] * eps)); } - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpby(a,c_x,b,y); Kokkos::parallel_for( @@ -221,11 +182,12 @@ void impl_test_team_axpby_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby( - teamMember, a, Kokkos::subview(c_x, Kokkos::ALL(), teamId), b, - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); EXPECT_NEAR_KK(AT::abs(const_non_const_result), AT::abs(expected_result[k]), @@ -260,8 +222,7 @@ int 
test_team_axpby() { // Test::impl_test_team_axpby(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -306,8 +267,7 @@ int test_team_axpby_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_axpy.hpp b/blas/unit_test/Test_Blas1_team_axpy.hpp index d861da45eb..5cff9d025e 100644 --- a/blas/unit_test/Test_Blas1_team_axpy.hpp +++ b/blas/unit_test/Test_Blas1_team_axpy.hpp @@ -40,54 +40,27 @@ void impl_test_team_axpy(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Y", N); ScalarA a = 3; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += - ScalarB(a * h_x(i) + h_y(i)) * ScalarB(a * h_x(i) + h_y(i)); + expected_result += ScalarB(a * x.h_view(i) + y.h_view(i)) * + ScalarB(a * x.h_view(i) + y.h_view(i)); // KokkosBlas::axpy(a,x,y); Kokkos::parallel_for( @@ -97,20 +70,22 @@ void impl_test_team_axpy(int N) { KokkosBlas::Experimental::axpy( teamMember, a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); + ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpy(a,c_x,y); Kokkos::parallel_for( @@ -120,16 +95,19 @@ void impl_test_team_axpy(int N) { KokkosBlas::Experimental::axpy( teamMember, a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = KokkosBlas::dot(c_y, c_y); + ScalarB const_nonconst_result = + KokkosBlas::dot(y.d_view_const, y.d_view_const); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -144,45 +122,28 @@ void impl_test_team_axpy_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = 
h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); - ScalarA a = 3; - typename ViewTypeA::const_type c_x = x; + ScalarA a = 3; ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += ScalarB(a * h_x(i, j) + h_y(i, j)) * - ScalarB(a * h_x(i, j) + h_y(i, j)); + expected_result[j] += ScalarB(a * x.h_view(i, j) + y.h_view(i, j)) * + ScalarB(a * x.h_view(i, j) + y.h_view(i, j)); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -195,18 +156,18 @@ void impl_test_team_axpy_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy( - teamMember, a, Kokkos::subview(x, Kokkos::ALL(), teamId), - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpy(a,c_x,y); Kokkos::parallel_for( @@ -214,11 +175,12 @@ void impl_test_team_axpy_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy( - teamMember, a, Kokkos::subview(c_x, Kokkos::ALL(), teamId), - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); EXPECT_NEAR_KK(const_non_const_result, expected_result[k], @@ -253,8 +215,7 @@ int test_team_axpy() { // Test::impl_test_team_axpy(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -299,8 +260,7 @@ int test_team_axpy_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_dot.hpp b/blas/unit_test/Test_Blas1_team_dot.hpp index 2de9ad8a7a..00c0940023 100644 --- a/blas/unit_test/Test_Blas1_team_dot.hpp +++ b/blas/unit_test/Test_Blas1_team_dot.hpp @@ -39,44 +39,20 @@ void impl_test_team_dot(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - - BaseTypeA b_a("A", N); - BaseTypeB b_b("B", N); - - ViewTypeA a = Kokkos::subview(b_a, Kokkos::ALL(), 0); - ViewTypeB b = Kokkos::subview(b_b, Kokkos::ALL(), 0); - - typename BaseTypeA::HostMirror h_b_a = Kokkos::create_mirror_view(b_a); - typename BaseTypeB::HostMirror h_b_b = Kokkos::create_mirror_view(b_b); - - typename ViewTypeA::HostMirror h_a = Kokkos::subview(h_b_a, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_b = Kokkos::subview(h_b_b, Kokkos::ALL(), 0); + view_stride_adapter a("a", N); + view_stride_adapter b("b", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_a, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_b, rand_pool, ScalarB(10)); + Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(b.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_b, b_b); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(b.h_base, b.d_base); ScalarA expected_result = 0; - for (int i = 0; i < N; i++) expected_result += h_a(i) * h_b(i); + for (int i = 0; i < N; i++) expected_result += a.h_view(i) * b.h_view(i); Kokkos::View r("PartialDots", M); Kokkos::View d_r("PartialDots", M); @@ 
-91,13 +67,15 @@ void impl_test_team_dot(int N) { d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, Kokkos::subview( - a, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + a.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - b, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + b.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) nonconst_nonconst_result += r(k); @@ -106,10 +84,6 @@ void impl_test_team_dot(int N) { EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - typename ViewTypeA::const_type c_a = a; - typename ViewTypeB::const_type c_b = b; - - // ScalarA const_const_result = KokkosBlas::dot(c_a,c_b); ScalarA const_const_result = 0; Kokkos::parallel_for( @@ -119,13 +93,15 @@ void impl_test_team_dot(int N) { d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, Kokkos::subview( - c_a, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + a.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_b, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + b.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) const_const_result += r(k); @@ -142,13 +118,15 @@ void impl_test_team_dot(int N) { d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, Kokkos::subview( - a, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N)), + a.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_b, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + b.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) nonconst_const_result += r(k); @@ -165,13 +143,15 @@ void impl_test_team_dot(int N) { d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, Kokkos::subview( - c_a, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + a.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - b, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + b.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) const_nonconst_result += r(k); @@ -190,40 +170,23 @@ void impl_test_team_dot_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_a("A", N, K); - typename vfB_type::BaseType b_b("B", N, K); - - ViewTypeA a = vfA_type::view(b_a); - ViewTypeB b = vfB_type::view(b_b); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - typename h_vfB_type::BaseType h_b_b = Kokkos::create_mirror_view(b_b); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); - typename ViewTypeB::HostMirror h_b = h_vfB_type::view(h_b_b); + view_stride_adapter a("A", N, K); + view_stride_adapter b("B", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_a, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_b, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_b, b_b); + Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(b.d_view, rand_pool, ScalarB(10)); - typename ViewTypeA::const_type c_a = a; - typename ViewTypeB::const_type c_b = b; + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(b.h_base, b.d_base); ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) expected_result[j] += h_a(i, j) * h_b(i, j); + for (int i = 0; i < N; i++) + expected_result[j] += a.h_view(i, j) * b.h_view(i, j); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -237,8 +200,8 @@ void impl_test_team_dot_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a, Kokkos::ALL(), teamId), - Kokkos::subview(b, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -253,8 +216,8 @@ void impl_test_team_dot_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(c_a, Kokkos::ALL(), teamId), - Kokkos::subview(c_b, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -269,8 +232,8 @@ void impl_test_team_dot_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a, Kokkos::ALL(), teamId), - Kokkos::subview(c_b, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -285,8 +248,8 @@ void impl_test_team_dot_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(c_a, Kokkos::ALL(), teamId), - Kokkos::subview(b, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -323,8 +286,7 @@ int 
test_team_dot() { // Test::impl_test_team_dot(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -369,8 +331,7 @@ int test_team_dot_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index da8c836130..f340ac2309 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -41,68 +41,33 @@ void impl_test_team_mult(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - typedef Kokkos::View< - ScalarC * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeC; - ScalarA a = 3; ScalarB b = 5; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeC b_z("Y", N); - BaseTypeC b_org_z("Org_Z", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - ViewTypeC z = Kokkos::subview(b_z, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - typename BaseTypeC::HostMirror h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - typename ViewTypeC::HostMirror h_z = Kokkos::subview(h_b_z, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter z("Z", N); + view_stride_adapter org_z("Org_Z", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(10)); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); + Kokkos::fill_random(z.d_view, rand_pool, ScalarC(10)); - Kokkos::deep_copy(b_org_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(z.h_base, z.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarC(b * h_z(i) + a * h_x(i) * h_y(i)) * - ScalarC(b * h_z(i) + a * h_x(i) * h_y(i)); + expected_result += + ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)) * + ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)); // 
KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( @@ -112,24 +77,28 @@ void impl_test_team_mult(int N) { KokkosBlas::Experimental::mult( teamMember, b, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarC nonconst_nonconst_result = KokkosBlas::dot(z, z); + ScalarC nonconst_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_z, b_org_z); + // Reset z on device to orig and run again with const-valued y + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,x,c_y); Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, @@ -138,23 +107,27 @@ void impl_test_team_mult(int N) { KokkosBlas::Experimental::mult( teamMember, b, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarC const_nonconst_result = KokkosBlas::dot(z, z); + ScalarC const_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_z, b_org_z); + // Reset z again to orig, and run with both x and y const + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,c_x,c_y); Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, @@ -163,20 +136,23 @@ void impl_test_team_mult(int N) { KokkosBlas::Experimental::mult( teamMember, b, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarC const_const_result = KokkosBlas::dot(z, z); + ScalarC const_const_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); } @@ -192,65 +168,37 @@ void impl_test_team_mult_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef multivector_layout_adapter vfB_type; - typedef multivector_layout_adapter vfC_type; - - BaseTypeA b_x("X", N); - typename vfB_type::BaseType b_y("Y", N, K); - typename vfC_type::BaseType b_z("Z", N, K); - typename vfC_type::BaseType b_org_z("Z", N, K); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = vfB_type::view(b_y); - ViewTypeC z = vfC_type::view(b_z); - - typedef multivector_layout_adapter h_vfB_type; - typedef multivector_layout_adapter h_vfC_type; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - typename h_vfC_type::BaseType h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - typename ViewTypeC::HostMirror h_z = h_vfC_type::view(h_b_z); + // x is rank-1, all others are rank-2 + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N, K); + view_stride_adapter z("Z", N, K); + view_stride_adapter org_z("Org_Z", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(10)); + typename Kokkos::ArithTraits::mag_type const max_val = 10; + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(max_val)); 
+ Kokkos::fill_random(y.d_view, rand_pool, ScalarB(max_val)); + Kokkos::fill_random(z.d_view, rand_pool, ScalarC(max_val)); - Kokkos::deep_copy(b_org_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - ScalarA a = 3; - ScalarB b = 5; - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; + ScalarA a = 3; + ScalarB b = 5; - ScalarC *expected_result = new ScalarC[K]; - for (int j = 0; j < K; j++) { - expected_result[j] = ScalarC(); - for (int i = 0; i < N; i++) - expected_result[j] += ScalarC(b * h_z(i, j) + a * h_x(i) * h_y(i, j)) * - ScalarC(b * h_z(i, j) + a * h_x(i) * h_y(i, j)); - } - - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - - Kokkos::View r("Dot::Result", K); + // In the operation z = (b*z) + (a*x*y) we estimate + // the largest rounding error to be dominated by max(b*z, a*x*y) + // Since b and a are known and the largest value in z, x and y + // is set by the variables max_val, the error upper bound will be + // max_error = a * max_val * max_val + typename Kokkos::ArithTraits::mag_type const eps = + Kokkos::ArithTraits::epsilon(); + typename Kokkos::ArithTraits::mag_type const max_error = + Kokkos::ArithTraits::abs(a) * max_val * max_val * eps; // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( @@ -258,34 +206,39 @@ void impl_test_team_mult_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( - teamMember, b, Kokkos::subview(z, Kokkos::ALL(), teamId), a, x, - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, + x.d_view, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, z, z); - for (int k = 0; k < K; k++) { - ScalarA nonconst_nonconst_result = r(k); - 
EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + + Kokkos::deep_copy(z.h_base, z.d_base); + + ScalarC temp; + for (int j = 0; j < K; j++) { + for (int i = 0; i < N; i++) { + temp = ScalarC(b * org_z.h_view(i, j) + a * x.h_view(i) * y.h_view(i, j)); + EXPECT_NEAR_KK(temp, z.h_view(i, j), max_error); + } } - Kokkos::deep_copy(b_z, b_org_z); + // Reset z on device and run again with const y + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,x,c_y); Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( - teamMember, b, Kokkos::subview(z, Kokkos::ALL(), teamId), a, x, - Kokkos::subview(c_y, Kokkos::ALL(), teamId)); + teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, + x.d_view, Kokkos::subview(y.d_view_const, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, z, z); + Kokkos::deep_copy(z.h_base, z.d_base); + for (int k = 0; k < K; k++) { - ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + for (int i = 0; i < N; ++i) { + temp = ScalarC(b * org_z.h_view(i, k) + a * x.h_view(i) * y.h_view(i, k)); + EXPECT_NEAR_KK(temp, z.h_view(i, k), max_error); + } } - - delete[] expected_result; } } // namespace Test @@ -323,8 +276,7 @@ int test_team_mult() { // Device>(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -384,8 +336,7 @@ int test_team_mult_mv() { // view_type_c_lr, Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_nrm2.hpp b/blas/unit_test/Test_Blas1_team_nrm2.hpp index 8ac35e5cbc..4bc4836782 100644 --- a/blas/unit_test/Test_Blas1_team_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_team_nrm2.hpp @@ -35,37 +35,24 @@ void impl_test_team_nrm2(int N, int K) { const team_policy policy(K, Kokkos::AUTO); typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_a, rand_pool, ScalarA(10)); - - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); typename AT::mag_type *expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) - expected_result[j] += AT::abs(h_a(i, j)) * AT::abs(h_a(i, j)); + expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); expected_result[j] = - Kokkos::Details::ArithTraits::sqrt( - expected_result[j]); + Kokkos::ArithTraits::sqrt(expected_result[j]); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -79,7 +66,7 @@ void impl_test_team_nrm2(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::nrm2( - teamMember, Kokkos::subview(a, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -94,7 +81,7 @@ void impl_test_team_nrm2(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::nrm2( - teamMember, Kokkos::subview(c_a, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -128,8 +115,7 @@ int test_team_nrm2() { // Test::impl_test_team_nrm2(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_team_nrm2(0, 5); diff --git a/blas/unit_test/Test_Blas1_team_scal.hpp b/blas/unit_test/Test_Blas1_team_scal.hpp index a33d5cd930..e0c109e1af 100644 --- a/blas/unit_test/Test_Blas1_team_scal.hpp +++ b/blas/unit_test/Test_Blas1_team_scal.hpp @@ -39,57 +39,26 @@ void impl_test_team_scal(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; + typedef Kokkos::ArithTraits AT; + + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); ScalarA a(3); typename AT::mag_type eps = 
AT::epsilon() * 1000; typename AT::mag_type zero = AT::abs(AT::zero()); typename AT::mag_type one = AT::abs(AT::one()); - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(1)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(1)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA expected_result(0); for (int i = 0; i < N; i++) { - expected_result += ScalarB(a * h_x(i)) * ScalarB(a * h_x(i)); + expected_result += ScalarB(a * x.h_view(i)) * ScalarB(a * x.h_view(i)); } Kokkos::parallel_for( @@ -99,18 +68,20 @@ void impl_test_team_scal(int N) { KokkosBlas::Experimental::scal( teamMember, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); { - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); + ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); typename AT::mag_type divisor = AT::abs(expected_result) == zero ? one : AT::abs(expected_result); typename AT::mag_type diff = @@ -118,7 +89,7 @@ void impl_test_team_scal(int N) { EXPECT_NEAR_KK(diff, zero, eps); } - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, @@ -127,18 +98,20 @@ void impl_test_team_scal(int N) { KokkosBlas::Experimental::scal( teamMember, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); { - ScalarB const_nonconst_result = KokkosBlas::dot(y, y); + ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); typename AT::mag_type divisor = AT::abs(expected_result) == zero ? 
one : AT::abs(expected_result); typename AT::mag_type diff = @@ -157,46 +130,25 @@ void impl_test_team_scal_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; - - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); + typedef Kokkos::ArithTraits AT; - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(1)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(1)); - - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA a(3); - typename ViewTypeA::const_type c_x = x; ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) { - expected_result[j] += ScalarB(a * h_x(i, j)) * ScalarB(a * h_x(i, j)); + expected_result[j] += + ScalarB(a * x.h_view(i, j)) * ScalarB(a * x.h_view(i, j)); } } @@ -211,11 +163,11 @@ void impl_test_team_scal_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y, 
Kokkos::ALL(), teamId), a, - Kokkos::subview(x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_scalar_result = r(k); typename AT::mag_type divisor = @@ -225,18 +177,19 @@ void impl_test_team_scal_mv(int N, int K) { EXPECT_NEAR_KK(diff, zero, eps); } - Kokkos::deep_copy(b_y, b_org_y); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), a, - Kokkos::subview(c_x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_scalar_result = r(k); typename AT::mag_type divisor = @@ -258,21 +211,24 @@ void impl_test_team_scal_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) { - expected_result[j] += - ScalarB((3.0 + j) * h_x(i, j)) * ScalarB((3.0 + j) * h_x(i, j)); + expected_result[j] += ScalarB((3.0 + j) * x.h_view(i, j)) * + ScalarB((3.0 + j) * x.h_view(i, j)); } } + // Zero out y to run again + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), - params(teamId), Kokkos::subview(x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, 
Kokkos::ALL(), teamId), + params(teamId), Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_vector_result = r(k); typename AT::mag_type divisor = @@ -282,18 +238,20 @@ void impl_test_team_scal_mv(int N, int K) { EXPECT_NEAR_KK(diff, zero, eps); } - Kokkos::deep_copy(b_y, b_org_y); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), - params(teamId), Kokkos::subview(c_x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + params(teamId), + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_vector_result = r(k); typename AT::mag_type divisor = @@ -331,8 +289,7 @@ int test_team_scal() { // Test::impl_test_team_scal(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -377,8 +334,7 @@ int test_team_scal_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_setscal.hpp b/blas/unit_test/Test_Blas1_team_setscal.hpp index fd30cc5bfb..ff593d3eeb 100644 --- a/blas/unit_test/Test_Blas1_team_setscal.hpp 
+++ b/blas/unit_test/Test_Blas1_team_setscal.hpp @@ -111,7 +111,7 @@ template ats; + typedef Kokkos::ArithTraits ats; /// radomized input testing views const ScalarType alpha = 11.1; diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index cf118e7ba2..09b60440ae 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -41,69 +41,34 @@ void impl_test_team_update(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - typedef Kokkos::View< - ScalarC * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeC; - ScalarA a = 3; ScalarB b = 5; ScalarC c = 7; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeC b_z("Y", N); - BaseTypeC b_org_z("Org_Z", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - ViewTypeC z = Kokkos::subview(b_z, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - typename BaseTypeC::HostMirror h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - typename ViewTypeC::HostMirror h_z = Kokkos::subview(h_b_z, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter z("Z", N); + view_stride_adapter org_z("Org_Z", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(10)); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); + Kokkos::fill_random(z.d_view, rand_pool, ScalarC(10)); - Kokkos::deep_copy(b_org_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(z.h_base, z.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarC(c * h_z(i) + a * h_x(i) + b * h_y(i)) * - ScalarC(c * h_z(i) + a * h_x(i) + b * h_y(i)); + expected_result += + ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)) * + ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)); // 
KokkosBlas::update(a,x,b,y,c,z); Kokkos::parallel_for( @@ -113,25 +78,28 @@ void impl_test_team_update(int N) { KokkosBlas::Experimental::update( teamMember, a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarC nonconst_nonconst_result = KokkosBlas::dot(z, z); + ScalarC nonconst_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_z, b_org_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,y,c,z); Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, @@ -140,24 +108,27 @@ void impl_test_team_update(int N) { KokkosBlas::Experimental::update( teamMember, a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarC const_nonconst_result = KokkosBlas::dot(z, z); + ScalarC const_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_z, b_org_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,c_y,c,z); Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, @@ -166,21 +137,24 @@ void impl_test_team_update(int N) { KokkosBlas::Experimental::update( teamMember, a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - c_y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarC const_const_result = KokkosBlas::dot(z, z); + ScalarC const_const_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); } @@ -196,60 +170,39 @@ void impl_test_team_update_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - typedef multivector_layout_adapter vfC_type; - - typename vfA_type::BaseType b_x("X", N, K); - typename vfB_type::BaseType b_y("Y", N, K); - typename vfC_type::BaseType b_z("Z", N, K); - typename vfC_type::BaseType b_org_z("Z", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - ViewTypeC z = vfC_type::view(b_z); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - typedef multivector_layout_adapter h_vfC_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - typename h_vfC_type::BaseType h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - typename ViewTypeC::HostMirror h_z = h_vfC_type::view(h_b_z); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter z("Z", N, K); + view_stride_adapter org_z("Org_Z", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(10)); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); + Kokkos::fill_random(z.d_view, rand_pool, ScalarC(10)); - Kokkos::deep_copy(b_org_z, b_z); + 
Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(z.h_base, z.d_base); - ScalarA a = 3; - ScalarB b = 5; - ScalarC c = 5; - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; + ScalarA a = 3; + ScalarB b = 5; + ScalarC c = 5; ScalarC *expected_result = new ScalarC[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarC(); for (int i = 0; i < N; i++) expected_result[j] += - ScalarC(a * h_x(i, j) + b * h_y(i, j) + c * h_z(i, j)) * - ScalarC(a * h_x(i, j) + b * h_y(i, j) + c * h_z(i, j)); + ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + + c * z.h_view(i, j)) * + ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + c * z.h_view(i, j)); } - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; Kokkos::View r("Dot::Result", K); @@ -259,29 +212,30 @@ void impl_test_team_update_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( - teamMember, a, Kokkos::subview(x, Kokkos::ALL(), teamId), b, - Kokkos::subview(y, Kokkos::ALL(), teamId), c, - Kokkos::subview(z, Kokkos::ALL(), teamId)); + teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, + Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, z, z); + KokkosBlas::dot(r, z.d_view, z.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } - Kokkos::deep_copy(b_z, b_org_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,y,c,z); Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member 
&teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( - teamMember, a, Kokkos::subview(c_x, Kokkos::ALL(), teamId), b, - Kokkos::subview(y, Kokkos::ALL(), teamId), c, - Kokkos::subview(z, Kokkos::ALL(), teamId)); + teamMember, a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, + Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, z, z); + KokkosBlas::dot(r, z.d_view, z.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); EXPECT_NEAR_KK(const_non_const_result, expected_result[k], @@ -326,8 +280,7 @@ int test_team_update() { // Device>(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -387,8 +340,7 @@ int test_team_update_mv() { // view_type_c_lr, Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_update.hpp b/blas/unit_test/Test_Blas1_update.hpp index 189dc2afb6..07445f595e 100644 --- a/blas/unit_test/Test_Blas1_update.hpp +++ b/blas/unit_test/Test_Blas1_update.hpp @@ -27,51 +27,15 @@ void impl_test_update(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - typedef Kokkos::View< 
- ScalarC * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeC; - ScalarA a = 3; ScalarB b = 5; ScalarC c = 7; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeC b_z("Y", N); - BaseTypeC b_org_z("Org_Z", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - ViewTypeC z = Kokkos::subview(b_z, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - typename BaseTypeC::HostMirror h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - typename ViewTypeC::HostMirror h_z = Kokkos::subview(h_b_z, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter z("Z", N); + view_stride_adapter org_z("Org_Z", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -79,52 +43,48 @@ void impl_test_update(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarC randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_z, rand_pool, randStart, randEnd); + Kokkos::fill_random(z.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_z, b_z); - auto h_b_org_z = - 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); - auto h_org_z = Kokkos::subview(h_b_org_z, Kokkos::ALL(), 0); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - KokkosBlas::update(a, x, b, y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + KokkosBlas::update(a, x.d_view, b, y.d_view, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + + c * org_z.h_view(i)), + z.h_view(i), eps); } - Kokkos::deep_copy(b_z, b_org_z); - KokkosBlas::update(a, c_x, b, y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::update(a, x.d_view_const, b, y.d_view, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + + c * org_z.h_view(i)), + z.h_view(i), eps); } - Kokkos::deep_copy(b_z, b_org_z); - KokkosBlas::update(a, c_x, b, c_y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::update(a, x.d_view_const, b, y.d_view_const, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + + c * org_z.h_view(i)), + z.h_view(i), eps); } } @@ -134,30 +94,10 @@ void impl_test_update_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef multivector_layout_adapter vfA_type; - typedef 
multivector_layout_adapter vfB_type; - typedef multivector_layout_adapter vfC_type; - - typename vfA_type::BaseType b_x("X", N, K); - typename vfB_type::BaseType b_y("Y", N, K); - typename vfC_type::BaseType b_z("Z", N, K); - typename vfC_type::BaseType b_org_z("Z", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - ViewTypeC z = vfC_type::view(b_z); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - typedef multivector_layout_adapter h_vfC_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - typename h_vfC_type::BaseType h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - typename ViewTypeC::HostMirror h_z = h_vfC_type::view(h_b_z); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter z("Z", N, K); + view_stride_adapter org_z("Org_Z", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -165,53 +105,50 @@ void impl_test_update_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarC randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_z, rand_pool, randStart, randEnd); + Kokkos::fill_random(z.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_z, b_z); - auto h_b_org_z = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - 
Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - ScalarA a = 3; - ScalarB b = 5; - ScalarC c = 5; - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; + ScalarA a = 3; + ScalarB b = 5; + ScalarC c = 5; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - KokkosBlas::update(a, x, b, y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + KokkosBlas::update(a, x.d_view, b, y.d_view, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_y(i, j) + - c * h_b_org_z(i, j)), - h_z(i, j), eps); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + + c * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } - Kokkos::deep_copy(b_z, b_org_z); - KokkosBlas::update(a, c_x, b, y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::update(a, x.d_view_const, b, y.d_view, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_y(i, j) + - c * h_b_org_z(i, j)), - h_z(i, j), eps); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + + c * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } } @@ -251,31 +188,28 @@ int test_update() { // Device>(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); - // Test::impl_test_update(132231); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update(1024); - Test::impl_test_update(1024); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); + // Test::impl_test_update(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update(1024); + Test::impl_test_update(1024); +#endif return 1; } @@ -314,30 +248,28 @@ int test_update_mv() { Device>(132231, 5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; typedef Kokkos::View - view_type_c_ls; Test::impl_test_update_mv(0, 5); Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(1024, 5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_update_mv(0, 5); + Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(1024, 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index 1df115d2c3..dc83ac82f5 100644 --- 
a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -28,12 +28,9 @@ void impl_test_gemv(const char* mode, int M, int N) { typedef typename ViewTypeY::value_type ScalarY; typedef Kokkos::ArithTraits KAT_Y; - typedef multivector_layout_adapter vfA_type; - - ScalarA alpha = 3; - ScalarY beta = 5; - double eps = - (std::is_same::value ? 1e-2 : 5e-10); + const ScalarA alpha = 3; + ScalarY beta = 5; + typename KAT_Y::mag_type const eps = KAT_Y::epsilon(); int ldx; int ldy; @@ -44,84 +41,79 @@ void impl_test_gemv(const char* mode, int M, int N) { ldx = M; ldy = N; } - typename vfA_type::BaseType b_A("A", M, N); - ViewTypeX x("X", ldx); - ViewTypeY y("Y", ldy); - ViewTypeY org_y("Org_Y", ldy); - - ViewTypeA A = vfA_type::view(b_A); - typename ViewTypeX::const_type c_x = x; - typename ViewTypeA::const_type c_A = A; - - typedef multivector_layout_adapter h_vfA_type; - typename h_vfA_type::BaseType h_b_A = Kokkos::create_mirror_view(b_A); - - typename ViewTypeA::HostMirror h_A = h_vfA_type::view(h_b_A); - typename ViewTypeX::HostMirror h_x = Kokkos::create_mirror_view(x); - typename ViewTypeY::HostMirror h_y = Kokkos::create_mirror_view(y); + view_stride_adapter A("A", M, N); + view_stride_adapter x("X", ldx); + view_stride_adapter y("Y", ldy); + view_stride_adapter org_y("Org_Y", ldy); Kokkos::Random_XorShift64_Pool rand_pool( 13718); + constexpr double max_valX = 1; + constexpr double max_valY = 1; + constexpr double max_valA = 1; { ScalarX randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Test::getRandomBounds(max_valX, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarY randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(y, rand_pool, randStart, randEnd); + Test::getRandomBounds(max_valY, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { 
ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_A, rand_pool, randStart, randEnd); + Test::getRandomBounds(max_valA, randStart, randEnd); + Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(org_y, y); - auto h_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); + const typename KAT_Y::mag_type max_error = + KAT_Y::abs(alpha * max_valA * max_valX * ldx + beta * max_valY); + const typename KAT_Y::mag_type tol = + max_error * eps * 2; // adding small fudge factor of 2 - Kokkos::deep_copy(h_x, x); - Kokkos::deep_copy(h_y, y); - Kokkos::deep_copy(h_b_A, b_A); + Kokkos::deep_copy(org_y.h_base, y.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(A.h_base, A.d_base); Kokkos::View expected("expected aAx+by", ldy); - Kokkos::deep_copy(expected, h_org_y); - vanillaGEMV(mode[0], alpha, h_A, h_x, beta, expected); + Kokkos::deep_copy(expected, org_y.h_view); + vanillaGEMV(mode[0], alpha, A.h_view, x.h_view, beta, expected); - KokkosBlas::gemv(mode, alpha, A, x, beta, y); - Kokkos::deep_copy(h_y, y); + KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view, beta, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); int numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) + if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) { numErrors++; + std::cerr << __FILE__ << ":" << __LINE__ + << ": expected(i)=" << expected(i) << ", h_y(i)=" << y.h_view(i) + << std::endl; + } } EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; - Kokkos::deep_copy(y, org_y); - KokkosBlas::gemv(mode, alpha, A, c_x, beta, y); - Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view_const, beta, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); numErrors 
= 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) - numErrors++; + if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; - Kokkos::deep_copy(y, org_y); - KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y); - Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::gemv(mode, alpha, A.d_view_const, x.d_view_const, beta, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) - numErrors++; + if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta @@ -130,15 +122,21 @@ void impl_test_gemv(const char* mode, int M, int N) { // This should overwrite the NaNs with the correct result. 
beta = KAT_Y::zero(); // beta changed, so update the correct answer - vanillaGEMV(mode[0], alpha, h_A, h_x, beta, expected); - Kokkos::deep_copy(y, KAT_Y::nan()); - KokkosBlas::gemv(mode, alpha, A, x, beta, y); - Kokkos::deep_copy(h_y, y); + vanillaGEMV(mode[0], alpha, A.h_view, x.h_view, beta, expected); + Kokkos::deep_copy(y.d_view, KAT_Y::nan()); + KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view, beta, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::isNan(h_y(i)) || - KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) + if (KAT_Y::isNan(y.h_view(i)) || + KAT_Y::abs(expected(i) - y.h_view(i)) > + KAT_Y::abs(alpha * max_valA * max_valX * ldx * eps * 2)) { numErrors++; + std::cerr << __FILE__ << ":" << __LINE__ << ": expected(" << i + << ")=" << expected(i) << ", h_y(" << i << ")=" << y.h_view(i) + << ", eps=" << eps + << ", 1024*2*eps=" << 1024 * 2 * KAT_Y::epsilon() << std::endl; + } } EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' << N << ", mode " << mode << ": gemv incorrect"; @@ -203,33 +201,36 @@ int test_gemv(const char* mode) { // Device>(mode,132231,1024); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; typedef Kokkos::View - view_type_c_ls; Test::impl_test_gemv( mode, 0, 1024); Test::impl_test_gemv( mode, 1024, 0); - Test::impl_test_gemv( mode, 13, 13); Test::impl_test_gemv( mode, 13, 1024); Test::impl_test_gemv( mode, 50, 40); - Test::impl_test_gemv( mode, 1024, 1024); Test::impl_test_gemv( mode, 2131, 2131); - // Test::impl_test_gemv(mode,132231,1024); - #endif +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View 
view_type_c_ls; + Test::impl_test_gemv( + mode, 0, 1024); + Test::impl_test_gemv( + mode, 1024, 0); + Test::impl_test_gemv( + mode, 13, 13); + Test::impl_test_gemv( + mode, 13, 1024); + Test::impl_test_gemv( + mode, 50, 40); + Test::impl_test_gemv( + mode, 1024, 1024); + Test::impl_test_gemv( + mode, 2131, 2131); + // Test::impl_test_gemv(mode,132231,1024); +#endif - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_gemv( mode, 1024, 1024); Test::impl_test_gemv( mode, 1024, 1024); #endif - */ +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_gemv( + mode, 1024, 1024); + Test::impl_test_gemv( + mode, 1024, 1024); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp new file mode 100644 index 0000000000..7e9ed08d88 --- /dev/null +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -0,0 +1,1572 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include +#include + +namespace Test { + +template +class GerTester { + public: + GerTester(); + + ~GerTester(); + + void test(const int M, const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults = false, + const bool useHermitianOption = false); + + private: + typedef Kokkos::View _ViewTypeX; + typedef Kokkos::View _ViewTypeY; + typedef Kokkos::View _ViewTypeA; + + typedef typename _ViewTypeX::HostMirror _HostViewTypeX; + typedef typename _ViewTypeY::HostMirror _HostViewTypeY; + typedef typename _ViewTypeA::HostMirror _HostViewTypeA; + typedef Kokkos::View + _ViewTypeExpected; + + typedef Kokkos::ArithTraits _KAT_A; + typedef typename _KAT_A::mag_type _AuxType; + + void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, _ViewTypeX& x, + _ViewTypeY& y, _ViewTypeA& A, + bool& expectedResultIsKnown); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename 
std::enable_if>::value || + std::is_same>::value, + void>::type + compareVanillaExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareVanillaExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareKokkosExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareKokkosExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected); + + template + T shrinkAngleToZeroTwoPiRange(const T input); + + template + void callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + _ViewTypeA& A, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; + const _AuxType _epsAbs; + const _AuxType _epsRel; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _kkGerShouldThrowException; +}; + +template +GerTester::GerTester() + : _A_is_complex(std::is_same>::value || + std::is_same>::value), + _A_is_lr(std::is_same::value), + _A_is_ll(std::is_same::value), + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< + typename Device::execution_space>()) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + , + _vanillaUsesDifferentOrderOfOps(_A_is_lr && _testIsGpu) +#else + , + _vanillaUsesDifferentOrderOfOps(false) +#endif + , + _epsAbs(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _epsRel(std::is_same<_AuxType, float>::value ? 
5.0e-3 : 1.0e-6), + _M(-1), + _N(-1), + _useAnalyticalResults(false), + _useHermitianOption(false), + _kkGerShouldThrowException(false) { +} + +template +GerTester::~GerTester() { + // Nothing to do +} + +template +void GerTester::test(const int M, const int N, + const int nonConstConstCombinations, + const bool useAnalyticalResults, + const bool useHermitianOption) { + std::cout << "Entering GerTester::test()... - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - " + << std::endl; + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " + << _vanillaUsesDifferentOrderOfOps << ", _epsAbs = " << _epsAbs + << ", _epsRel = " << _epsRel << std::endl; + + // ******************************************************************** + // Step 1 of 9: declare main types and variables + // ******************************************************************** + _M = M; + _N = N; + _useAnalyticalResults = useAnalyticalResults; + _useHermitianOption = useHermitianOption; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + _kkGerShouldThrowException = false; + if (_A_is_complex && _useHermitianOption) { + if ((_testIsGpu == false) && (_A_is_ll == false)) { + _kkGerShouldThrowException = true; + } else if ((_testIsGpu == true) && (_A_is_ll == false)) { + _kkGerShouldThrowException = true; + } + } +#endif + + bool test_x_y(false); + bool test_cx_y(false); + bool test_x_cy(false); + bool test_cx_cy(false); + if (nonConstConstCombinations == 0) { + test_x_y = true; + } else if (nonConstConstCombinations == 1) { + test_cx_y = true; + } else if (nonConstConstCombinations == 2) { + test_x_cy = true; + } else if (nonConstConstCombinations == 3) { + test_cx_cy = true; + } else { + test_x_y = true; + test_cx_y = true; + test_x_cy = true; + test_cx_cy = true; + } + + 
view_stride_adapter<_ViewTypeX, false> x("X", _M); + view_stride_adapter<_ViewTypeY, false> y("Y", _N); + view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); + + view_stride_adapter<_ViewTypeExpected, true> h_expected( + "expected A += alpha * x * y^{t,h}", _M, _N); + bool expectedResultIsKnown = false; + + ScalarA alpha(0.); + + // ******************************************************************** + // Step 2 of 9: populate alpha, h_x, h_y, h_A, h_expected, x, y, A + // ******************************************************************** + this->populateVariables(alpha, x.h_view, y.h_view, A.h_view, + h_expected.d_view, x.d_view, y.d_view, A.d_view, + expectedResultIsKnown); + + // ******************************************************************** + // Step 3 of 9: populate h_vanilla + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_vanilla( + "vanilla = A + alpha * x * y^{t,h}", _M, _N); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", + typeid(alpha).name()); + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, + h_vanilla.d_view); + + // ******************************************************************** + // Step 4 of 9: use h_vanilla and h_expected as appropriate + // ******************************************************************** + if (expectedResultIsKnown) { + // ****************************************************************** + // Compare h_vanilla against h_expected + // ****************************************************************** + this->compareVanillaExpected(alpha, h_vanilla.d_view, h_expected.d_view); + } else { + // ****************************************************************** + // Copy h_vanilla to h_expected + // ****************************************************************** + Kokkos::deep_copy(h_expected.d_base, h_vanilla.d_base); + } + + // 
******************************************************************** + // Step 5 of 9: test with 'non const x' and 'non const y' + // ******************************************************************** + view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); + Kokkos::deep_copy(org_A.d_base, A.d_base); + + if (test_x_y) { + this->callKkGerAndCompareAgainstExpected( + alpha, x.d_view, y.d_view, A.d_view, A.h_view, h_expected.d_view, + "non const {x,y}"); + } + + // ******************************************************************** + // Step 6 of 9: test with const x + // ******************************************************************** + if (test_cx_y) { + Kokkos::deep_copy(A.d_base, org_A.d_base); + + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, + A.d_view, A.h_view, + h_expected.d_view, "const x"); + } + + // ******************************************************************** + // Step 7 of 9: test with const y + // ******************************************************************** + if (test_x_cy) { + Kokkos::deep_copy(A.d_base, org_A.d_base); + + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, + A.d_view, A.h_view, + h_expected.d_view, "const y"); + } + + // ******************************************************************** + // Step 8 of 9: test with const x and const y + // ******************************************************************** + if (test_cx_cy) { + Kokkos::deep_copy(A.d_base, org_A.d_base); + + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, + y.d_view_const, A.d_view, A.h_view, + h_expected.d_view, "const {x,y}"); + } + + // ******************************************************************** + // Step 9 of 9: tests with invalid values on the first input parameter + // ******************************************************************** + EXPECT_ANY_THROW(KokkosBlas::ger(".", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk ger should have 
thrown an exception for mode '.'"; + EXPECT_ANY_THROW(KokkosBlas::ger("", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk ger should have thrown an exception for mode ''"; + + std::cout << "Leaving GerTester::test() - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - " + << std::endl; +} + +template +void GerTester::populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, + _ViewTypeX& x, _ViewTypeY& y, + _ViewTypeA& A, + bool& expectedResultIsKnown) { + expectedResultIsKnown = false; + + if (_useAnalyticalResults) { + this->populateAnalyticalValues(alpha, h_x, h_y, h_A, h_expected); + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + expectedResultIsKnown = true; + } else if ((_M == 1) && (_N == 1)) { + alpha = 3; + + h_x[0] = 2; + + h_y[0] = 3; + + h_A(0, 0) = 7; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + h_expected(0, 0) = 25; + expectedResultIsKnown = true; + } else if ((_M == 1) && (_N == 2)) { + alpha = 3; + + h_x[0] = 2; + + h_y[0] = 3; + h_y[1] = 4; + + h_A(0, 0) = 7; + h_A(0, 1) = -6; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + h_expected(0, 0) = 25; + h_expected(0, 1) = 18; + expectedResultIsKnown = true; + } else if ((_M == 2) && (_N == 2)) { + alpha = 3; + + h_x[0] = 2; + h_x[1] = 9; + + h_y[0] = -3; + h_y[1] = 7; + + h_A(0, 0) = 17; + h_A(0, 1) = -43; + h_A(1, 0) = 29; + h_A(1, 1) = 101; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + h_expected(0, 0) = -1; + h_expected(0, 1) = -1; + h_expected(1, 0) = -52; + h_expected(1, 1) = 290; + expectedResultIsKnown = true; + } else { + alpha = 3; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, 
randEnd); + Kokkos::fill_random(x, rand_pool, randStart, randEnd); + } + + { + ScalarY randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(y, rand_pool, randStart, randEnd); + } + + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(h_x, x); + Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(h_A, A); + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +GerTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + _AuxType auxI(0.); + _AuxType auxJ(0.); + _AuxType auxIpJ(0.); + _AuxType auxImJ(0.); + + alpha.real() = 1.; + alpha.imag() = -1.; + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i].real() = sin(auxI); + h_x[i].imag() = cos(auxI); + } + + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + h_y[j].real() = cos(auxJ); + h_y[j].imag() = sin(auxJ); + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = + -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + h_A(i, j).imag() = + -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + } + } + } else { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_A(i, 
j).real() = + -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + h_A(i, j).imag() = + -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + } + } + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j).real() = -2. * sin(auxI) * sin(auxJ); + h_expected(i, j).imag() = 2. * (cos(auxIpJ) - sin(auxIpJ)); + } + } + } else { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j).real() = 2. * cos(auxI) * cos(auxJ); + h_expected(i, j).imag() = -2. * sin(auxImJ); + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +GerTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + _AuxType auxI(0.); + _AuxType auxJ(0.); + _AuxType auxIpJ(0.); + + alpha = 3; + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i] = sin(auxI); + } + + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + h_y[j] = cos(auxJ); + } + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + h_A(i, j) = 3 * cos(auxI) * sin(auxJ); + } + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + auxIpJ = 
this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j) = 3 * sin(auxIpJ); + } + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +GerTester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_vanillaUsesDifferentOrderOfOps) { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i, j) = h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i); + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_y(j) * h_x(i); + } + } + } + } else { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)); + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_y(j); + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +GerTester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_y(j) * h_x(i); + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_y(j); + } + } + } +} + +template +template +T GerTester::shrinkAngleToZeroTwoPiRange(const T input) { + T output(input); +#if 0 + T twoPi( 2. * Kokkos::numbers::pi ); + if (input > 0.) { + output -= std::floor( input / twoPi ) * twoPi; + } + else if (input < 0.) 
{ + output += std::floor( -input / twoPi ) * twoPi; + } +#endif + return output; +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +GerTester::compareVanillaExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + errorHappened = false; + if (h_expected(i, j).real() == 0.) 
{ + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - " + "h_vanilla(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; + } + + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + errorHappened = false; + if (h_expected(i, j).imag() == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - " + "h_vanilla(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll 
+ << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_vanilla(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); + if (numErrorsReal > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_vanilla(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); + if (numErrorsImag > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + } else { + int numErrorsReal(0); + int numErrorsImag(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j).real() != h_vanilla(i, j).real()) { + if (numErrorsReal == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " + << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << std::endl; + } + numErrorsReal++; + } + + if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { + if (numErrorsImag == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " + << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << std::endl; + } + numErrorsImag++; + } + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type 
+GerTester::compareVanillaExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j) - h_vanilla(i, j)); + errorHappened = false; + if (h_expected(i, j) == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? 
h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_vanilla(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); + if (numErrors > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } else { + int numErrors(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j) != h_vanilla(i, j)) { + if (numErrors == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; + } + numErrors++; + } + } // for j + } // for i + EXPECT_EQ(numErrors, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +GerTester::compareKokkosExpected(const T& alpha, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j).real() - h_A(i, j).real()); + errorHappened = false; + if 
(h_expected(i, j).real() == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; + } + + diff = _KAT_A::abs(h_expected(i, j).imag() - h_A(i, j).imag()); + errorHappened = false; + if (h_expected(i, j).imag() == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; + } + } // for j + } // for i + std::cout + << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << 
typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + if ((_M == 2131) && (_N == 2131)) { + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(11, 2119) = (" << h_expected(11, 2119).real() + << ", " << h_expected(11, 2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " + << h_A(11, 2119).imag() << ")" << std::endl; + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(710, 1065) = (" << h_expected(710, 1065).real() + << ", " << h_expected(710, 1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " + << h_A(710, 1065).imag() << ")" << std::endl; + } + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": ger result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); + if (numErrorsReal > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": ger result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); + if (numErrorsImag > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +GerTester::compareKokkosExpected(const T& alpha, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j) - h_A(i, j)); + errorHappened = false; + if (h_expected(i, j) == 0.) 
{ + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_A(i,j) = " << h_A(i, j) + << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; + } + } // for j + } // for i + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": ger result is incorrect" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); + if (numErrors > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +template +template +void GerTester:: + callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + _ViewTypeA& A, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): " + "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkGerShouldThrowException); + std::string mode = _useHermitianOption ? "H" : "T"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::ger(mode.c_str(), alpha, x, y, A); + } catch (const std::exception& e) { + std::cout << "In Test_Blas2_ger, '" << situation + << "': caught exception, e.what() = " << e.what() << std::endl; + gotStdException = true; + } catch (...) 
{ + std::cout << "In Test_Blas2_ger, '" << situation + << "': caught unknown exception" << std::endl; + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened"; + + EXPECT_EQ(gotStdException, _kkGerShouldThrowException) + << "Failed test, '" << situation << "': kk ger() should" + << (_kkGerShouldThrowException ? " " : " not ") + << "have thrown a std::exception"; + + if ((gotStdException == false) && (gotUnknownException == false)) { + Kokkos::deep_copy(h_A, A); + + this->compareKokkosExpected(alpha, h_A, h_expected); + } +} + +} // namespace Test + +template +int test_ger(const std::string& caseName) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s, device = %s ...\n", + caseName.c_str(), typeid(Device).name()); + + bool xBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool yBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool aBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool useAnalyticalResults = xBool && yBool && aBool; + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", + caseName.c_str()); + + if (true) { + Test::GerTester + tester; + tester.test(0, 13, 0); + tester.test(1024, 0, 0); + tester.test(1, 1, 0); + tester.test(2, 2, 0); + tester.test(1, 2, 0); + tester.test(13, 13, 0); + tester.test(13, 1024, 0); + if (useAnalyticalResults) { + tester.test(13, 1024, 0, true, false); + 
tester.test(13, 1024, 0, true, true); + } else { + tester.test(13, 1024, 0, false, true); + } + tester.test(50, 40, 4); + tester.test(1024, 1024, 0); + tester.test(2131, 2131, 0); + if (useAnalyticalResults) { + tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); + } else { + tester.test(2131, 2131, 0, false, true); + } + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTRIGHT ...\n", + caseName.c_str()); + + if (true) { + Test::GerTester + tester; + tester.test(0, 13, 0); + tester.test(1024, 0, 0); + tester.test(1, 1, 0); + tester.test(2, 2, 0); + tester.test(1, 2, 0); + tester.test(13, 13, 0); + tester.test(13, 1024, 0); + if (useAnalyticalResults) { + tester.test(13, 1024, 0, true, false); + tester.test(13, 1024, 0, true, true); + } else { + tester.test(13, 1024, 0, false, true); + } + tester.test(50, 40, 4); + tester.test(1024, 1024, 0); + tester.test(2131, 2131, 0); + if (useAnalyticalResults) { + tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); + } else { + tester.test(2131, 2131, 0, false, true); + } + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#endif + +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + 
"+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", + caseName.c_str()); + + if (true) { + Test::GerTester + tester; + tester.test(0, 13, 0); + tester.test(1024, 0, 0); + tester.test(13, 13, 0); + tester.test(13, 1024, 0); + if (useAnalyticalResults) { + tester.test(13, 1024, 0, true, false); + tester.test(13, 1024, 0, true, true); + } else { + tester.test(13, 1024, 0, false, true); + } + tester.test(50, 40, 4); + tester.test(1024, 1024, 0); + tester.test(2131, 2131, 0); + if (useAnalyticalResults) { + tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); + } else { + tester.test(2131, 2131, 0, false, true); + } + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", + caseName.c_str()); + + if (true) { + Test::GerTester + tester; + tester.test(1024, 1024, 0); + if (useAnalyticalResults) { + tester.test(1024, 1024, 0, true, false); + tester.test(1024, 1024, 0, true, true); + } else { + tester.test(1024, 1024, 0, false, true); + } + } + + if (true) { + Test::GerTester + tester; + tester.test(1024, 1024, 0); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#endif + + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + 
"+=======================================================================" + "===\n"); + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_float"); + test_ger("test case ger_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_complex_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_float"); + test_ger, Kokkos::complex, + Kokkos::complex, TestExecSpace>( + "test case ger_complex_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double"); + test_ger("test case ger_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_double"); + test_ger, Kokkos::complex, + Kokkos::complex, TestExecSpace>( + "test case ger_complex_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_int"); + test_ger("test case ger_int"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, ger_double_int_float) { + 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int_float"); + test_ger("test case ger_mixed_types"); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index 7c3b140611..13c52ec437 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace Test { template APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -91,7 +93,8 @@ void build_matrices(const int M, const int N, const int K, // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in // Trilinos issue #6418 - const uint64_t seed = Kokkos::Impl::clock_tic(); + const uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); Kokkos::fill_random(A, rand_pool, Kokkos::rand APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; KOKKOS_INLINE_FUNCTION @@ -174,7 +177,7 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; double machine_eps = APT::epsilon(); @@ -184,7 +187,8 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, ViewTypeC C("C", M, N); ViewTypeC C2("C", M, N); - const uint64_t seed = Kokkos::Impl::clock_tic(); + const uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in @@ -253,16 +257,15 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, } } -template -void 
impl_test_stream_gemm(const int M, const int N, const int K, - const Scalar alpha, const Scalar beta) { - using execution_space = TestExecSpace; - using ViewTypeA = Kokkos::View; - using ViewTypeB = Kokkos::View; - using ViewTypeC = Kokkos::View; - using ScalarC = typename ViewTypeC::value_type; - using APT = Kokkos::Details::ArithTraits; - using mag_type = typename APT::mag_type; +template +void impl_test_stream_gemm_psge2(const int M, const int N, const int K, + const Scalar alpha, const Scalar beta) { + using ViewTypeA = Kokkos::View; + using ViewTypeB = Kokkos::View; + using ViewTypeC = Kokkos::View; + using ScalarC = typename ViewTypeC::value_type; + using APT = Kokkos::ArithTraits; + using mag_type = typename APT::mag_type; const char tA[] = {"N"}; const char tB[] = {"N"}; @@ -368,12 +371,17 @@ void test_gemm() { } } } - Test::impl_test_stream_gemm(53, 42, 17, 4.5, - 3.0); // General code path - Test::impl_test_stream_gemm( - 13, 1, 17, 4.5, 3.0); // gemv based gemm code path - Test::impl_test_stream_gemm(7, 13, 17, 4.5, - 3.0); // dot based gemm code path + auto pool_size = TestExecSpace().concurrency(); + if (pool_size >= 2) { + Test::impl_test_stream_gemm_psge2( + 53, 42, 17, 4.5, + 3.0); // General code path + Test::impl_test_stream_gemm_psge2( + 13, 1, 17, 4.5, 3.0); // gemv based gemm code path + Test::impl_test_stream_gemm_psge2( + 7, 13, 17, 4.5, + 3.0); // dot based gemm code path + } } template diff --git a/blas/unit_test/Test_Blas3_trmm.hpp b/blas/unit_test/Test_Blas3_trmm.hpp index bf44debaf4..188999c5e0 100644 --- a/blas/unit_test/Test_Blas3_trmm.hpp +++ b/blas/unit_test/Test_Blas3_trmm.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace Test { template @@ -54,7 +56,7 @@ struct trmm_VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename 
APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -100,7 +102,7 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, const char* diag, int M, int N, Scalar alpha) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; - using APT = Kokkos::Details::ArithTraits; + using APT = Kokkos::ArithTraits; using mag_type = typename APT::mag_type; double machine_eps = APT::epsilon(); @@ -110,12 +112,13 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, ViewTypeA A("A", K, K); ViewTypeB B("B", M, N); ViewTypeB B_expected("B_expected", M, N); - uint64_t seed = Kokkos::Impl::clock_tic(); - ScalarA beta = ScalarA(0); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + ScalarA beta = ScalarA(0); // printf("KokkosBlas::trmm test for alpha %g, %c %c %c %c, M %d, N %d, eps // %g, ViewType: %s\n", - // Kokkos::Details::ArithTraits::real(alpha),side[0],uplo[0],trans[0],diag[0],M,N,eps,typeid(ViewTypeA).name()); + // Kokkos::ArithTraits::real(alpha),side[0],uplo[0],trans[0],diag[0],M,N,eps,typeid(ViewTypeA).name()); typename ViewTypeA::HostMirror host_A = Kokkos::create_mirror_view(A); typename ViewTypeB::HostMirror host_B_actual = Kokkos::create_mirror_view(B); diff --git a/blas/unit_test/Test_Blas3_trsm.hpp b/blas/unit_test/Test_Blas3_trsm.hpp index 5857f7a533..5edd175652 100644 --- a/blas/unit_test/Test_Blas3_trsm.hpp +++ b/blas/unit_test/Test_Blas3_trsm.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace Test { template @@ -54,7 +56,7 @@ struct trsm_VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -102,7 +104,7 @@ void impl_test_trsm(const char* side, 
const char* uplo, const char* trans, typename ViewTypeA::value_type alpha) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; - using APT = Kokkos::Details::ArithTraits; + using APT = Kokkos::ArithTraits; using mag_type = typename APT::mag_type; double machine_eps = APT::epsilon(); @@ -121,7 +123,8 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, typename ViewTypeB::HostMirror h_B = Kokkos::create_mirror_view(B); typename ViewTypeB::HostMirror h_X0 = Kokkos::create_mirror_view(X0); - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); if ((diag[0] == 'U') || (diag[0] == 'u')) { diff --git a/blas/unit_test/Test_Blas_gesv.hpp b/blas/unit_test/Test_Blas_gesv.hpp index 207a06db07..710102137e 100644 --- a/blas/unit_test/Test_Blas_gesv.hpp +++ b/blas/unit_test/Test_Blas_gesv.hpp @@ -13,13 +13,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -// Note: Luc Berger-Vergiat 04/15/21 -// This test should only be included -// in the CUDA backend if TPL MAGMA -// has been enabled. 
-#if !defined(TEST_CUDA_BLAS_CPP) || \ - (defined(TEST_CUDA_BLAS_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) +// only enable this test where KokkosBlas supports gesv: +// CUDA+MAGMA and HOST+BLAS +#if (defined(TEST_CUDA_BLAS_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ + (defined(TEST_OPENMP_BLAS_CPP) || defined(TEST_OPENMPTARGET_BLAS_CPP) || \ + defined(TEST_SERIAL_BLAS_CPP) || defined(TEST_THREADS_BLAS_CPP))) #include #include @@ -36,7 +37,7 @@ template void impl_test_gesv(const char* mode, const char* padding, int N) { typedef typename Device::execution_space execution_space; typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -128,9 +129,9 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { if (ats::abs(h_B(i) - h_X0(i)) > eps) { test_flag = false; // printf( " Error %d, pivot %c, padding %c: result( %.15lf ) != - // solution( %.15lf ) at (%ld)\n", N, mode[0], padding[0], - // ats::abs(h_B(i)), ats::abs(h_X0(i)), i ); - break; + // solution( %.15lf ) at (%d)\n", N, mode[0], padding[0], + // ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i) ); + // break; } } ASSERT_EQ(test_flag, true); @@ -141,7 +142,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) { typedef typename Device::execution_space execution_space; typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -337,9 +338,6 @@ int test_gesv_mrhs(const char* mode) { return 1; } -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) - #if defined(KOKKOSKERNELS_INST_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -414,6 +412,4 @@ TEST_F(TestCategory, 
gesv_mrhs_complex_float) { } #endif -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA || KOKKOSKERNELS_ENABLE_TPL_BLAS - -#endif // Check for TPL MAGMA when compiling the CUDA tests +#endif // CUDA+MAGMA or BLAS+HOST diff --git a/blas/unit_test/Test_Blas_trtri.hpp b/blas/unit_test/Test_Blas_trtri.hpp index d333b963b4..0bebb9edf0 100644 --- a/blas/unit_test/Test_Blas_trtri.hpp +++ b/blas/unit_test/Test_Blas_trtri.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace Test { template @@ -53,7 +55,7 @@ struct VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -99,7 +101,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, const int M, const int N) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; - using APT = Kokkos::Details::ArithTraits; + using APT = Kokkos::ArithTraits; using mag_type = typename APT::mag_type; double machine_eps = APT::epsilon(); @@ -109,8 +111,9 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, ViewTypeA A("A", M, N); ViewTypeA A_original("A_original", M, N); ViewTypeA A_I("A_I", M, N); // is I taken...? 
- uint64_t seed = Kokkos::Impl::clock_tic(); - ScalarA beta = ScalarA(0); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + ScalarA beta = ScalarA(0); ScalarA cur_check_val; // Either 1 or 0, to check A_I // const int As0 = A.stride(0), As1 = A.stride(1); diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index f66253a5f6..913b4e67a5 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -263,9 +263,9 @@ display_help_text() { echo " ZEN = AMD Zen-Core CPU" echo " ZEN2 = AMD Zen2-Core CPU" echo " [AMD: GPU]" - echo " VEGA900 = AMD GPU MI25 GFX900" echo " VEGA906 = AMD GPU MI50/MI60 GFX906" - echo " VEGA908 = AMD GPU" + echo " VEGA908 = AMD GPU MI100 GFX908" + echo " VEGA90A = AMD GPU MI200 series GFX90A" echo " [ARM]" echo " ARMV80 = ARMv8.0 Compatible CPU" echo " ARMV81 = ARMv8.1 Compatible CPU" @@ -358,16 +358,17 @@ display_help_text() { # echo "--with-hpx-options=[OPT]: Additional options to HPX:" # echo " enable_async_dispatch" echo "--no-default-eti: Do not include default ETI types for Kokkos Kernels" + echo "--disable-test-eti-only: Do not restrict testing to ETI types for Kokkos Kernels" echo "--gcc-toolchain=/Path/To/GccRoot: Set the gcc toolchain to use with clang (e.g. 
/usr)" echo "--kokkos-make-j=[NUM]: Set -j parallel level for kokkos install" echo " Default: j == 4" - echo "--enable-tests: build Kokkos Kernels unit tests" - echo "--disable-tests: Do not build Kokkos Kernels unit tests" - echo "--disable-perftests: Do not build Kokkos Kernels performance tests" - echo "--enable-perftests: build Kokkos Kernels performance tests (default)" + echo "--enable-tests: build Kokkos Kernels unit tests" + echo "--disable-tests: Do not build Kokkos Kernels unit tests" + echo "--disable-perftests: Do not build Kokkos Kernels performance tests" + echo "--enable-perftests: build Kokkos Kernels performance tests (default)" echo "--deprecated-code Enable deprecated code (disabled by default)" - echo "--export-compile-commands: export cmake compile_commands.json file" - + echo "--export-compile-commands: export cmake compile_commands.json file" + echo "--enable-docs: build the Kokkos Kernels developer documentation (requires sphinx, doxygen)" } @@ -379,6 +380,7 @@ KOKKOSKERNELS_DO_TESTS=ON KOKKOSKERNELS_DO_PERFTESTS=ON KOKKOSKERNELS_DO_PERFSUITE=OFF KOKKOSKERNELS_DO_EXAMPLES=ON +KOKKOSKERNELS_DO_DOCS=OFF CMAKE_EXPORT_COMPILE_COMMANDS=OFF @@ -512,6 +514,9 @@ do --no-default-eti) KERNELS_DEFAULT_ETI_OPTION="-DKokkosKernels_ADD_DEFAULT_ETI=OFF" ;; + --disable-test-eti-only) + KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION="-DKokkosKernels_TEST_ETI_ONLY=OFF" + ;; --kokkos-release) KOKKOS_RELEASE=ON ;; @@ -569,6 +574,9 @@ do --deprecated-code) KOKKOS_DEPRECATED_CODE=ON ;; + --enable-docs) + KOKKOSKERNELS_DO_DOCS=ON + ;; --compiler*) COMPILER="${key#*=}" CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l) @@ -816,6 +824,6 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} 
-DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} 
-DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${KOKKOSKERNELS_PATH} diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d3b393ddde..777d4445b3 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - 
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms + LIB_REQUIRED_PACKAGES Kokkos LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) diff --git a/cmake/KokkosKernels_Version_Info.hpp.in b/cmake/KokkosKernels_Version_Info.hpp.in new file mode 100644 index 0000000000..62bcaed88c --- /dev/null +++ b/cmake/KokkosKernels_Version_Info.hpp.in @@ -0,0 +1,36 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_VERSION_INFO_HPP +#define KOKKOSKERNELS_VERSION_INFO_HPP + +#include + +namespace KokkosKernels { +namespace Impl { + +constexpr std::string_view GIT_BRANCH = R"branch(@GIT_BRANCH@)branch"; +constexpr std::string_view GIT_COMMIT_HASH = "@GIT_COMMIT_HASH@"; +constexpr std::string_view GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; +constexpr std::string_view GIT_COMMIT_DESCRIPTION = + R"message(@GIT_COMMIT_DESCRIPTION@)message"; +constexpr std::string_view GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; +constexpr std::string_view BENCHMARK_VERSION = "@BENCHMARK_VERSION@"; + +} // namespace Impl +} // namespace KokkosKernels + +#endif // KOKKOSKERNELS_VERSION_INFO_HPP diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 22a6cd9416..22b7a196fc 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -31,6 +31,7 @@ #cmakedefine HAVE_KOKKOSKERNELS_MKL #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE +#cmakedefine 
KOKKOSKERNELS_ENABLE_BENCHMARK /* Define this macro if experimental features of Kokkoskernels are enabled */ #cmakedefine HAVE_KOKKOSKERNELS_EXPERIMENTAL diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index 3d5f297f52..52f4571976 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -1,4 +1,30 @@ -IF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") +find_package(MKL) +IF(TARGET MKL::MKL) + # MKL version >= 2021 (see kokkos wiki and intel documentation. MKL CMake module file has been introduced starting MKL >= 2021) + IF (KOKKOS_ENABLE_SYCL) #get from kokkos-core + # MKL version >= 2022 (see kokkos wiki) + IF (NOT TARGET MKL::MKL_DPCPP) + MESSAGE(FATAL_ERROR "KOKKOS_ENABLE_SYCL activated but the target MKL_DPCPP wasn't found") + ENDIF() + ENDIF() + SET(TPL_MKL_IMPORTED_NAME MKL::MKL) + SET(TPL_IMPORTED_NAME MKL::MKL) + ADD_LIBRARY(MKL INTERFACE) + IF(KOKKOS_ENABLE_SYCL) + TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL MKL::MKL_DPCPP) + ELSE() + TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL ) + ENDIF() + ADD_LIBRARY(KokkosKernels::MKL ALIAS MKL ) + GET_TARGET_PROPERTY(LIB_TYPE ${TPL_IMPORTED_NAME} TYPE) + MESSAGE("LIB_TYPE: ${LIB_TYPE}") + # kokkoskernels_export_imported_tpl install MKL with target name MKL instead of + # MKL::MKL or KokkosKernels::MKL, so we need to install a specific ALIAS one + if(TARGET MKL) + MESSAGE("TARGET MKL CREATED") + ENDIF() +ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") +# Regular way with MKL version < 2021 (Where MKL doesn't provide cmake module file) TRY_COMPILE(KOKKOSKERNELS_HAS_MKL_ARG ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/mkl.cpp @@ -46,5 +72,5 @@ ELSE() HEADER_PATHS ${MKL_ROOT}/include ) - ENDIF() + ENDIF() ENDIF() diff --git a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index 9346475f91..a90ad69bf0 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -16,9 +16,3 @@ 
CHECK_KOKKOS_BACKEND(OPENMPTARGET) CHECK_KOKKOS_BACKEND(CUDA) CHECK_KOKKOS_BACKEND(HIP) CHECK_KOKKOS_BACKEND(SYCL) - -# for backward compatibility. can be dropped when requiring Kokkos 3.6 -IF (Kokkos_ENABLE_PTHREAD) - SET(KOKKOS_ENABLE_THREADS ON) - SET(KOKKOSKERNELS_INST_EXECSPACE_THREADS_DEFAULT ON) -ENDIF() diff --git a/cmake/kokkoskernels_benchmarks.cmake b/cmake/kokkoskernels_benchmarks.cmake new file mode 100644 index 0000000000..3a38feee88 --- /dev/null +++ b/cmake/kokkoskernels_benchmarks.cmake @@ -0,0 +1,83 @@ +IF(KOKKOSKERNELS_HAS_TRILINOS) + MESSAGE( + FATAL_ERROR + "Benchmarks are not supported when building as part of Trilinos") +ENDIF() + +FIND_PACKAGE(benchmark QUIET) + +IF(benchmark_FOUND) + MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") +ELSE() + MESSAGE(STATUS "No installed google benchmark found, fetching from GitHub") + INCLUDE(FetchContent) + SET(BENCHMARK_ENABLE_TESTING OFF) + + LIST(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") + + # Note: recent bug (google/benchmark#1441) is preventing us from using + # the latest benchmark release. 
+ SET(BENCHMARK_VERSION 1.6.2) + FetchContent_Declare( + googlebenchmark + URL https://github.com/google/benchmark/archive/refs/tags/v${BENCHMARK_VERSION}.tar.gz + URL_HASH MD5=14d14849e075af116143a161bc3b927b + ) + FetchContent_MakeAvailable(googlebenchmark) + LIST(POP_BACK CMAKE_MESSAGE_INDENT) + + TARGET_COMPILE_OPTIONS(benchmark PRIVATE -w) + TARGET_COMPILE_OPTIONS(benchmark_main PRIVATE -w) +ENDIF() + +FUNCTION(KOKKOSKERNELS_ADD_BENCHMARK NAME) + CMAKE_PARSE_ARGUMENTS( + BENCHMARK + "" + "" + "SOURCES" + ${ARGN} + ) + + IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) + MESSAGE( + WARNING + "Unexpected arguments when adding a benchmark: " + ${BENCHMARK_UNPARSED_ARGUMENTS} + ) + ENDIF() + + SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) + + ADD_EXECUTABLE( + ${BENCHMARK_NAME} + ${BENCHMARK_SOURCES} + ) + TARGET_LINK_LIBRARIES( + ${BENCHMARK_NAME} + PRIVATE benchmark::benchmark Kokkos::kokkoskernels + ) + TARGET_INCLUDE_DIRECTORIES( + ${BENCHMARK_NAME} + SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include + ) + + FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) + SET_SOURCE_FILES_PROPERTIES( + ${SOURCE_FILE} + PROPERTIES LANGUAGE CXX + ) + ENDFOREACH() + + STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) + SET( + BENCHMARK_ARGS + --benchmark_counters_tabular=true + --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json + ) + + ADD_TEST( + NAME ${BENCHMARK_NAME} + COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} + ) +ENDFUNCTION() diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index 56ab1a7c31..84c68658b7 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -44,6 +44,12 @@ KOKKOSKERNELS_ADD_OPTION( BOOL "Whether to build the graph component. Default: OFF" ) +KOKKOSKERNELS_ADD_OPTION( + "ENABLE_COMPONENT_ODE" + OFF + BOOL + "Whether to build the ode component. 
Default: OFF" +) # Graph depends on everything else because it depends @@ -70,6 +76,7 @@ IF (KokkosKernels_ENABLE_ALL_COMPONENTS) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_SPARSE ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) + SET(KokkosKernels_ENABLE_COMPONENT_ODE ON CACHE BOOL "" FORCE) ENDIF() # KOKKOSKERNELS_ALL_COMPONENTS_ENABLED says whether all components are on, @@ -79,9 +86,10 @@ ENDIF() IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED AND KokkosKernels_ENABLE_COMPONENT_BLAS AND KokkosKernels_ENABLE_COMPONENT_GRAPH - AND KokkosKernels_ENABLE_COMPONENT_SPARSE) + AND KokkosKernels_ENABLE_COMPONENT_SPARSE + AND KokkosKernels_ENABLE_COMPONENT_ODE) SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED ON CACHE BOOL "" FORCE) ELSE() SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED OFF CACHE BOOL "" FORCE) ENDIF() -mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) \ No newline at end of file +mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) diff --git a/cmake/kokkoskernels_eti.cmake b/cmake/kokkoskernels_eti.cmake index 1823bf96b6..524cad11f9 100644 --- a/cmake/kokkoskernels_eti.cmake +++ b/cmake/kokkoskernels_eti.cmake @@ -130,7 +130,6 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) ${ARGN}) STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) - SET(ETI_DECL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_DECL") SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") @@ -152,9 +151,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(APPEND MACRO_STRING ")") STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) #Make a single header file for all instances - LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") - SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") #Make a different 
source file for each instance SET(INST_SOURCE "${ETI_COMPONENTS}/eti/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") SET(INST_TEMPLATE "${ETI_COMPONENTS}/eti/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") @@ -169,17 +166,12 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) SET(AVAIL_HEADER "${ETI_COMPONENTS}/eti/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_avail.hpp") SET(AVAIL_TEMPLATE "${AVAIL_HEADER}.in") - SET(DECL_HEADER "${ETI_COMPONENTS}/eti/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_decl.hpp") - SET(DECL_TEMPLATE "${DECL_HEADER}.in") STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_INST_BLOCK "${${UPPER_NAME}_ETI_INST_LIST}") STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_AVAIL_BLOCK "${${UPPER_NAME}_ETI_AVAIL_LIST}") - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} - ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${AVAIL_TEMPLATE} ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) - LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) ENDMACRO(KOKKOSKERNELS_GENERATE_ETI) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index ea03953d29..d223e00171 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -46,24 +46,15 @@ IF(KOKKOS_ENABLE_CUDA) "Whether to pre instantiate kernels for the execution space Kokkos::Cuda. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) - # By default, instantiate only for Cuda's default memory space (either CudaSpace, or CudaUVMSpace). 
- IF(KOKKOS_ENABLE_CUDA_UVM) - SET(CUDA_CUDAUVMSPACE_DEFAULT ON) - SET(CUDA_CUDASPACE_DEFAULT OFF) - ELSE() - SET(CUDA_CUDAUVMSPACE_DEFAULT OFF) - SET(CUDA_CUDASPACE_DEFAULT ON) - ENDIF() - KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_CUDAUVMSPACE - ${CUDA_CUDAUVMSPACE_DEFAULT} + OFF BOOL - "Whether to pre instantiate kernels for the memory space Kokkos::CudaUVMSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the memory space Kokkos::CudaUVMSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: OFF." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_CUDASPACE - ${CUDA_CUDASPACE_DEFAULT} + ON BOOL "Whether to pre instantiate kernels for the memory space Kokkos::CudaSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) @@ -136,7 +127,7 @@ IF(KOKKOS_ENABLE_OPENMPTARGET) INST_EXECSPACE_OPENMPTARGET ${KOKKOSKERNELS_INST_EXECSPACE_OPENMPTARGET_DEFAULT} BOOL - "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::OpenMPTarget. Disabling this when Kokkos_ENABLE_OpenMPTarget is enabled may increase build times. Default: ON if Kokkos is OpenMPTarget-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::OpenMPTarget. Disabling this when Kokkos_ENABLE_OPENMPTARGET is enabled may increase build times. Default: ON if Kokkos is OpenMPTarget-enabled, OFF otherwise." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_OPENMPTARGETSPACE @@ -172,7 +163,7 @@ KOKKOSKERNELS_ADD_OPTION( INST_EXECSPACE_OPENMP ${KOKKOSKERNELS_INST_EXECSPACE_OPENMP_DEFAULT} BOOL - "Whether to pre instantiate kernels for the execution space Kokkos::OpenMP. Disabling this when Kokkos_ENABLE_OpenMP is enabled may increase build times. Default: ON if Kokkos is OpenMP-enabled, OFF otherwise." 
+ "Whether to pre instantiate kernels for the execution space Kokkos::OpenMP. Disabling this when Kokkos_ENABLE_OPENMP is enabled may increase build times. Default: ON if Kokkos is OpenMP-enabled, OFF otherwise." ) IF(KOKKOSKERNELS_INST_EXECSPACE_OPENMP AND KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE) LIST(APPEND DEVICE_LIST "") diff --git a/cmake/kokkoskernels_eti_layouts.cmake b/cmake/kokkoskernels_eti_layouts.cmake index 38835c129d..647d835353 100644 --- a/cmake/kokkoskernels_eti_layouts.cmake +++ b/cmake/kokkoskernels_eti_layouts.cmake @@ -1,3 +1,7 @@ +SET(RIGHT_LAYOUTS + LAYOUTRIGHT) +SET(LEFT_LAYOUTS + LAYOUTLEFT) SET(LAYOUTS LAYOUTLEFT LAYOUTRIGHT) diff --git a/cmake/kokkoskernels_version_info.cmake b/cmake/kokkoskernels_version_info.cmake new file mode 100644 index 0000000000..33c7c222e6 --- /dev/null +++ b/cmake/kokkoskernels_version_info.cmake @@ -0,0 +1,107 @@ +# https://jonathanhamberg.com/post/cmake-embedding-git-hash/ + +find_package(Git QUIET) + +SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) +SET(pre_configure_file ${CURRENT_LIST_DIR}/KokkosKernels_Version_Info.hpp.in) +SET(post_configure_file ${CMAKE_BINARY_DIR}/KokkosKernels_Version_Info.hpp) + +FUNCTION(check_git_write git_hash git_clean_status) + FILE( + WRITE + ${CMAKE_BINARY_DIR}/git-state.txt + "${git_hash}-${git_clean_status}") +ENDFUNCTION() + +FUNCTION(check_git_read git_hash) + IF(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) + FILE(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) + LIST(GET CONTENT 0 var) + + message(DEBUG "Cached Git hash: ${var}") + SET(${git_hash} ${var} PARENT_SCOPE) + else() + SET(${git_hash} "INVALID" PARENT_SCOPE) + ENDIF() +ENDFUNCTION() + +FUNCTION(check_git_version) + IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOSKERNELS_TOP_SOURCE_DIR}/.git) + configure_file(${pre_configure_file} ${post_configure_file} @ONLY) + return() + ENDIF() + + # Get the current working branch + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY 
${KOKKOSKERNELS_TOP_SOURCE_DIR} + OUTPUT_VARIABLE GIT_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE) + + # Get the latest commit description + execute_process( + COMMAND ${GIT_EXECUTABLE} show -s --format=%s + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + OUTPUT_VARIABLE GIT_COMMIT_DESCRIPTION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + # Get the latest commit date + execute_process( + COMMAND ${GIT_EXECUTABLE} log -1 --format=%cI + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + OUTPUT_VARIABLE GIT_COMMIT_DATE + OUTPUT_STRIP_TRAILING_WHITESPACE) + + # Check if repo is dirty / clean (diff-index --quiet exits 0 when clean) + execute_process( + COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + RESULT_VARIABLE IS_DIRTY + OUTPUT_STRIP_TRAILING_WHITESPACE) + + IF(IS_DIRTY EQUAL 0) + SET(GIT_CLEAN_STATUS "CLEAN") + else() + SET(GIT_CLEAN_STATUS "DIRTY") + ENDIF() + + # Get the latest abbreviated commit hash of the working branch + execute_process( + COMMAND ${GIT_EXECUTABLE} log -1 --format=%h + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + OUTPUT_VARIABLE GIT_COMMIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE) + + check_git_read(GIT_HASH_CACHE) + + # Only update the version header if the hash or clean status has changed. + # Both STREQUAL operands are quoted so the cached value is compared as text. + IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL "${GIT_HASH_CACHE}" + OR NOT EXISTS ${post_configure_file}) + # Record the new state in git-state.txt so the next build won't have + # to regenerate the header.
+ check_git_write(${GIT_COMMIT_HASH} ${GIT_CLEAN_STATUS}) + + configure_file(${pre_configure_file} ${post_configure_file} @ONLY) + message(STATUS "Configured git information in ${post_configure_file}") + ENDIF() +ENDFUNCTION() + +# Pass BENCHMARK_VERSION variable to configure benchmark library version +FUNCTION(check_version_info) + add_custom_target( + AlwaysCheckGit COMMAND ${CMAKE_COMMAND} + -DRUN_CHECK_GIT_VERSION=1 + -DKOKKOSKERNELS_TOP_SOURCE_DIR=${KOKKOSKERNELS_TOP_SOURCE_DIR} + -DBENCHMARK_VERSION=${BENCHMARK_VERSION} + -P ${CURRENT_LIST_DIR}/kokkoskernels_version_info.cmake + BYPRODUCTS ${post_configure_file}) + + add_dependencies(kokkoskernels AlwaysCheckGit) + check_git_version() +ENDFUNCTION() + +# This is used to run this function from an external cmake process. +IF(RUN_CHECK_GIT_VERSION) + check_git_version() +ENDIF() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 72972b5cd7..88bf237274 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,9 +1,3 @@ # Adding source directory to the build LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src) -LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src/impl) -LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src/tpls) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) - -# Adding unit-tests -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/common) -KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/common) diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in b/common/src/KokkosKernels_AlwaysFalse.hpp similarity index 56% rename from sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in rename to common/src/KokkosKernels_AlwaysFalse.hpp index 4eb5388da1..36f4572d29 100644 --- 
a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in +++ b/common/src/KokkosKernels_AlwaysFalse.hpp @@ -14,16 +14,26 @@ // //@HEADER -#ifndef KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +#ifndef KOKKOSKERNELS_ALWAYSFALSE_HPP +#define KOKKOSKERNELS_ALWAYSFALSE_HPP -namespace KokkosSparse { -namespace Experimental { +#include + +/*! \file KokkosKernels_AlwaysFalse.hpp + \brief A convenience type to be used in a static_assert that should always + fail +*/ + +namespace KokkosKernels { namespace Impl { -// clang-format off -@SPARSE_SPMV_MV_BSRMATRIX_ETI_DECL_BLOCK@ -// clang-format on + +template +using always_false = std::false_type; + +template +inline constexpr bool always_false_v = always_false::value; + } // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse -#endif \ No newline at end of file +} // namespace KokkosKernels + +#endif // KOKKOSKERNELS_ALWAYSFALSE_HPP diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index 9ebb104378..4d732a8437 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -62,6 +62,8 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, * * For _MSG checks, the msg argument can contain '<<' if not a kernel check. 
* + * KK_USER_REQUIRE* are for checking user inputs + * * This code is adapted from EKAT/src/ekat/ekat_assert.hpp */ @@ -103,6 +105,10 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, #define KK_REQUIRE_MSG(condition, msg) \ IMPL_THROW(condition, msg, std::logic_error) +#define KK_USER_REQUIRE(condition) IMPL_THROW(condition, "", std::runtime_error) +#define KK_USER_REQUIRE_MSG(condition, msg) \ + IMPL_THROW(condition, msg, std::runtime_error) + #define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") #define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) diff --git a/common/src/KokkosKernels_Iota.hpp b/common/src/KokkosKernels_Iota.hpp new file mode 100644 index 0000000000..5b7e24ca24 --- /dev/null +++ b/common/src/KokkosKernels_Iota.hpp @@ -0,0 +1,144 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_IOTA_HPP +#define _KOKKOSKERNELS_IOTA_HPP + +#include + +#include + +#include "KokkosKernels_Error.hpp" + +/*! \file KokkosKernels_Iota.hpp + * Define an Iota struct that implements a small subset of Kokkos::View and + * related utilities. + */ + +namespace KokkosKernels { +namespace Impl { + +/*! 
\class Iota + \brief A class that mimics a small subset of Kokkos::View + + \tparam T the type returned by operator() + \tparam SizeType a custom offset type + + \typedef size_type SizeType + \typedef value_type T + \typedef non_const_value_type non-const T + \typedef device_type void + \typedef data_type const value_type * + \enum rank always 1 + + Iota::operator() returns offset + i + Meant to be used in place of a Kokkos::View where entry i holds i + offset. + Unlike a Kokkos::View, Iota is not materialized in memory. + + Constructing with a size less than 0 yields a 0-size Iota +*/ +template +class Iota { + public: + using size_type = SizeType; + using value_type = T; + using non_const_value_type = std::remove_const; + using device_type = void; + using data_type = const value_type *; + + /*! \brief construct an Iota where iota(i) -> offset + i + + \param[in] size the number of entries + \param[in] offset the offset of the first entry + + Constructing with size < 0 yields a 0-size Iota + */ + KOKKOS_INLINE_FUNCTION + constexpr Iota(const size_type &size, const value_type offset) + : size_(size), offset_(offset) { + if constexpr (std::is_signed_v) { + if (size_ < size_type(0)) { + size_ = 0; + } + } + } + + /*! \brief construct an Iota where iota(i) -> i + + \param[in] size the number of entries + */ + KOKKOS_INLINE_FUNCTION + explicit constexpr Iota(const size_type &size) : Iota(size, 0) {} + + /*! \brief construct a zero-sized iota + */ + KOKKOS_INLINE_FUNCTION + constexpr Iota() : size_(0), offset_(0) {} + + /*!
\brief Construct Iota subview + + Like the Kokkos::View 1D subview constructor: + \verbatim + Kokkos::View a(10); // size = 10 + Kokkos::View b(a, Kokkos::pair{3,7}); // entries 3,4,5,6 of a + + Iota a(10); + Iota b(a, Kokkos::pair{3,7}); // entries 3,4,5,6 of a + \endverbatim + + Creating a subview outside of the base Iota yields undefined behavior + */ + template + KOKKOS_INLINE_FUNCTION constexpr Iota(const Iota &base, + const Kokkos::pair &range) + : Iota(range.second - range.first, base.offset_ + range.first) {} + + /*! \brief Return entry i of the Iota, i.e. offset + i + + i >= size() or i < 0 yields undefined behavior. + */ + KOKKOS_INLINE_FUNCTION + constexpr T operator()(size_type i) const noexcept { + return value_type(i + offset_); + }; + + /// \brief return the size of the iota + KOKKOS_INLINE_FUNCTION + constexpr size_t size() const noexcept { return size_; } + + /// \brief Iotas are always like a rank-1 Kokkos::View + enum { rank = 1 }; + + private: + size_type size_; + value_type offset_; +}; + +/// \class is_iota +/// \brief is_iota::value is true if T is an Iota<...>, false otherwise +template +struct is_iota : public std::false_type {}; +template +struct is_iota> : public std::true_type {}; +template +struct is_iota> : public std::true_type {}; +template +inline constexpr bool is_iota_v = is_iota::value; + +} // namespace Impl +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_IOTA_HPP diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp new file mode 100644 index 0000000000..22df9545ef --- /dev/null +++ b/common/src/KokkosKernels_LowerBound.hpp @@ -0,0 +1,470 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software.
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_LOWERBOUND_HPP +#define _KOKKOSKERNELS_LOWERBOUND_HPP + +/*! \file KokkosKernels_LowerBound.hpp + Define thread and team-collaborative lower-bound search + + Lower-bound search takes a Kokkos::View, a search value, and a binary + predicate. + It returns an index to the first element of the view that does not + satisfy pred(element, value), or the size of the view if no such + element exists. + + All elements for which pred(element, value) is true must precede those + for which it is false. + + The default predicate is less-than, i.e. pred(a,b) = a < b. + In this case, lower-bound search returns the first index where the view + entry is >= the value. + + The type of the predicate function must be equivalent to the following: + \verbatim + bool operator()(const T &a, const T &b); + \endverbatim + KokkosKernels_Predicates.hpp defines a variety of common predicates, + available in KokkosKernels namespace. + + Examples: + \verbatim + value = 3 + view = {0,1,2,3,4} + = {t,t,t,f,f} + result = 3 + + value = -1 + view = {0,1,2,3,4} + = {f,f,f,f,f} + result = 0 + + value = 5 + view = {0,1,2,3,4} + = {t,t,t,t,t} + result = 5 + + value = 1 + view = {0,1,1,1,2} + = {t,f,f,f,f} + result = 1 + \endverbatim + + Contrast with upper-bound, which returns first index for which pred(value, + element) is true + */ + +#include + +#include "KokkosKernels_Predicates.hpp" +#include "KokkosKernels_SimpleUtils.hpp" + +namespace KokkosKernels { +namespace Impl { + +/*!
\brief Single-thread sequential lower-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(element, value) is false, + or view.size if no such element exists + + At most view.size() predicate function calls +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type +lower_bound_sequential_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + using size_type = typename ViewLike::size_type; + static_assert(1 == ViewLike::rank, + "lower_bound_sequential_thread requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_sequential_thread requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + size_type i = 0; + while (i < view.size() && pred(view(i), value)) { + ++i; + } + return i; +} + +/*! 
\brief Single-thread binary lower-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(element, value) is false, + or view.size if no such element exists + + At most log2(view.size()) + 1 predicate function calls +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + using size_type = typename ViewLike::size_type; + static_assert(1 == ViewLike::rank, + "lower_bound_binary_thread requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_binary_thread requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + size_type lo = 0; + size_type hi = view.size(); + while (lo < hi) { + size_type mid = (lo + hi) / 2; + const auto &ve = view(mid); + if (pred(ve, value)) { // mid satisfies predicate, look in higher half not + // including mid + lo = mid + 1; + } else { + hi = mid; + } + } + return lo; +} + +} // namespace Impl + +/*! 
\brief single-thread lower-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(element, value) is false, + or view.size if no such element exists + + This minimizes the calls to predicate: + for view.size() >= 8, this does a binary search, otherwise, a linear search +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, + "lower_bound_thread requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_thread requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + /* + sequential search makes on average 0.5 * view.size memory accesses + binary search makes log2(view.size)+1 accesses + + log2(x) <= 0.5x roughly when x >= 8 + */ + if (view.size() >= 8) { + return Impl::lower_bound_binary_thread(view, value, pred); + } else { + return Impl::lower_bound_sequential_thread(view, value, pred); + } +} + +namespace Impl { + +/*! \brief Team lower-bound search executed by a single thread + + \tparam TeamMember the team policy member type + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred The type of the predicate function to call + + \param handle The Kokkos team handle + \param view The view-like to search + \param value The value to compare in the predicate + \param pred Apply pred(view(i), value) + + The whole view is searched, i.e. i in [0, view.size()) + + \returns To all team members, the smallest i for which pred(view(i), value) + is false, or view.size() if no such value + + Uses a single thread to call \c lower_bound_thread, and broadcasts that + to all team members.
+*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_single_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + typename ViewLike::size_type idx; + Kokkos::single( + Kokkos::PerTeam(handle), + [&](typename ViewLike::size_type &lidx) { + lidx = KokkosKernels::lower_bound_thread(view, value, pred); + }, + idx); + return idx; +} + +/*! \brief Team-collaborative sequential lower-bound search + + \tparam TeamMember the team policy member type + \tparam ViewLike A Kokkos::View or KokkosKernels::Iota + \tparam Pred The type of the predicate function to call + + \param handle The Kokkos team handle + \param view The view-like to search + \param value The value to compare in the predicate + \param lo The first index to search + \param hi One-past the last index to search + \param pred Apply pred(view(i), value) + + \returns To all team members, the smallest i for which pred(view(i), value) + is false for i in [lo, hi), or hi if no such value + + Apply pred(view(i), value) for i in [lo, hi) +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, + typename ViewLike::size_type lo, typename ViewLike::size_type hi, + Pred pred = Pred()) { + using size_type = typename ViewLike::size_type; + static_assert(1 == ViewLike::rank, + "lower_bound_sequential_team requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_sequential_team requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + if (lo == hi) { + return hi; + } + size_type teamI; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(handle, lo, hi), + [&](const size_type &i, size_type &li) { + li = KOKKOSKERNELS_MACRO_MIN(li, hi); + if (i < li) { // no need to search higher than the smallest so far + if (!pred(view(i), 
value)) { // look for the smallest index that does + // not satisfy + li = i; + } + } + }, + Kokkos::Min(teamI)); + return teamI; +} + +/*! \brief Team-collaborative sequential lower-bound search + + \tparam TeamMember the team policy member type + \tparam ViewLike A Kokkos::View or KokkosKernels::Iota + \tparam Pred The type of the predicate function to call + + \param handle The Kokkos team handle + \param view The view-like to search + \param value The value to compare in the predicate + \param pred Apply pred(view(i), value) + + \returns To all team members, the smallest i for which pred(view(i), value) + is false or view.size() if no such value +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + return lower_bound_sequential_team(handle, view, value, 0, view.size(), pred); +} + +/*! \brief A range for the k-ary lower bound search + + The RangeReducer will maximize the lower bound and + minimize the upper bound +*/ +template +struct Range { + T lb; /// lower-bound + T ub; /// upper-bound + + KOKKOS_INLINE_FUNCTION + Range() { init(); } + + KOKKOS_INLINE_FUNCTION + constexpr Range(const T &_lb, const T &_ub) : lb(_lb), ub(_ub) {} + + KOKKOS_INLINE_FUNCTION + void init() { + lb = Kokkos::Experimental::finite_min_v; // will be max'd + ub = Kokkos::Experimental::finite_max_v; // will be min'd + } +}; + +/// \brief maximizes the lower bound, and minimizes the upper bound of a Range +template +struct RangeReducer { + using reducer = RangeReducer; + using value_type = Range; + using result_view_type = + Kokkos::View *, Space, Kokkos::MemoryUnmanaged>; + + private: + value_type &value; + + public: + KOKKOS_INLINE_FUNCTION + RangeReducer(value_type &value_) : value(value_) {} + + KOKKOS_INLINE_FUNCTION + void join(value_type &dst, const value_type &src) const { + dst.lb = 
KOKKOSKERNELS_MACRO_MAX(dst.lb, src.lb); + dst.ub = KOKKOSKERNELS_MACRO_MIN(dst.ub, src.ub); + } + + KOKKOS_INLINE_FUNCTION + void init(value_type &val) const { val.init(); } + + KOKKOS_INLINE_FUNCTION + value_type &reference() const { return value; } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return result_view_type(&value, 1); } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return true; } +}; + +/*! \brief team-collaborative K-ary lower-bound search + + \tparam TeamMember the team policy member type + \tparam ViewLike A Kokkos::View or KokkosKernels::Iota + \tparam Pred the binary predicate function type + + Actually, K+1-ary, where K is the size of the team + Split the view into K+1 segments at K points + Evaluate the predicate in parallel at each point and use a joint min-max + parallel reduction: + * The lower bound is after the max index where the predicate was true + * The upper bound is no greater than the min index where the predicate was + false. Once there are no more values left than threads in the team, switch to + team sequential search +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_kary_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, + "lower_bound_kary_team requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_kary_team requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + using size_type = typename ViewLike::size_type; + + size_type lo = 0; + size_type hi = view.size(); + while (lo < hi) { + // if no more than team_size elements left, just hit them all sequentially + if (lo + handle.team_size() >= hi) { + return lower_bound_sequential_team(handle, view, value, lo, hi, pred); + } + + // otherwise, split the region up among threads + size_type mid = + lo + (hi - lo) * (handle.team_rank() + 1) / (handle.team_size() +
1); + auto ve = view(mid); + + // reduce across threads to figure out where the new search bounds are + // if a thread satisfies the predicate, the first element that does not + // satisfy must be after that thread's search point. we want the max such + // point across all threads. if a thread does not satisfy the predicate, the + // first element that does not satisfy must be before or equal. we want the + // min such point across all threads + Range teamRange; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(handle, 0, handle.team_size()), + [&](const int &, Range &lr) { + lr.lb = KOKKOSKERNELS_MACRO_MAX(lo, lr.lb); // no lower than lo + lr.ub = KOKKOSKERNELS_MACRO_MIN(hi, lr.ub); // no higher than hi + // if pred(view(mid), value), then the lower bound is above this + if (pred(ve, value)) { + lr.lb = mid + 1; + } else { // otherwise the lower bound is no larger than this + lr.ub = mid; + } + }, + RangeReducer(teamRange)); + + // next iteration, search in the newly-discovered window + hi = teamRange.ub; + lo = teamRange.lb; + } + return lo; +} + +} // namespace Impl + +/*! \brief Team-collaborative lower-bound search + + \tparam TeamMember the team policy member type + \tparam ViewLike a Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred the type of the predicate + + \param handle a Kokkos team handle + \param view the view-like to search + \param value the value to search for + \param pred the predicate to test entries in the view + + \returns The smallest i in range [0, view.size()) for which pred(view(i), + value) is not true, or view.size() if no such `i` exists + + default pred is `element < value`, i.e. return the index to the first + element in the view that does not satisfy `element < value`.
For well-ordered + types this is the first element where element >= value + + Pred should be a binary function comparing two `typename + ViewLike::non_const_value_type` +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, "lower_bound_team requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_team requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + /* kary search is A = (k-1) * (logk(view.size()) + 1) accesses + + sequential search is B = view.size() accesses + + A < B is true roughly when view.size() > 3 * k + */ + if (view.size() > 3 * size_t(handle.team_size())) { + return Impl::lower_bound_kary_team(handle, view, value, pred); + } else { + return Impl::lower_bound_sequential_team(handle, view, value, pred); + } +} + +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_LOWERBOUND_HPP \ No newline at end of file diff --git a/common/src/KokkosKernels_Predicates.hpp b/common/src/KokkosKernels_Predicates.hpp new file mode 100644 index 0000000000..a741d1353a --- /dev/null +++ b/common/src/KokkosKernels_Predicates.hpp @@ -0,0 +1,167 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_PREDICATES_HPP +#define _KOKKOSKERNELS_PREDICATES_HPP + +#include "Kokkos_ArithTraits.hpp" + +/*!
\file KokkosKernels_Predicates.hpp + * Define predicates for KokkosKernels search functions + */ + +namespace KokkosKernels { + +/** + * @brief Struct template for a greater-than predicate + * @tparam T Type to be compared + */ +template +struct GT { + using value_type = T; + static_assert(!Kokkos::ArithTraits::is_complex, + "Please define custom predicates for ordering complex types"); + + /** + * @brief Return true if a is greater than b + * @param a First value to be compared + * @param b Second value to be compared + */ + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const + noexcept { + return a > b; + } +}; + +/*! \brief "Greater-than-or-equal" predicate, a >= b + \tparam T the type to compare +*/ +template +struct GTE { + using value_type = T; + static_assert(!Kokkos::ArithTraits::is_complex, + "Please define custom predicates for ordering complex types"); + + /// \brief return a >= b + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const + noexcept { + return a >= b; + } +}; + +/*! \brief "Less-than" predicate, a < b + \tparam T the type to compare +*/ +template +struct LT { + using value_type = T; + static_assert(!Kokkos::ArithTraits::is_complex, + "Please define custom predicates for ordering complex types"); + + /// \brief return a < b + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const + noexcept { + return a < b; + } +}; + +/*! \brief "Less-than-or-equal" predicate, a <= b + \tparam T the type to compare +*/ +template +struct LTE { + using value_type = T; + static_assert(!Kokkos::ArithTraits::is_complex, + "Please define custom predicates for ordering complex types"); + + /// \brief return a <= b + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const + noexcept { + return a <= b; + } +}; + +/*! 
\brief "Equal" predicate, a == b + \tparam T the type to compare +*/ +template +struct Equal { + using value_type = T; + + /// \brief return a == b + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const { + return a == b; + } +}; + +/** + * @brief Struct template for inverting a predicate + * @tparam Pred Predicate type to be inverted + */ +template +struct Neg { + using value_type = typename Pred::value_type; + + /** + * @brief Constructor + * @param pred Predicate object to be inverted + */ + KOKKOS_INLINE_FUNCTION + constexpr Neg(const Pred &pred) : pred_(pred) {} + + /** + * @brief Return the boolean inverse of the underlying predicate + * @param a First value to be compared by the predicate + * @param b Second value to be compared by the predicate + * @return Boolean inverse of the result of the predicate applied to a and b + */ + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const { + return !pred_(a, b); + } + + private: + Pred pred_; ///< Underlying predicate object +}; + +/*! 
\brief Reflect a predicate, pred(b, a) + \tparam Pred the type of the predicate to reflect +*/ +template +struct Refl { + using value_type = typename Pred::value_type; + + KOKKOS_INLINE_FUNCTION + constexpr Refl(const Pred &pred) : pred_(pred) {} + + /// \brief return the underlying binary predicate with reversed arguments + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const { + return pred_(b, a); + } + + private: + Pred pred_; +}; + +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_PREDICATES_HPP \ No newline at end of file diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp new file mode 100644 index 0000000000..cd2333b3ec --- /dev/null +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -0,0 +1,154 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP +#define _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP + +#include "KokkosKernels_config.h" +#include "KokkosKernels_TplsVersion.hpp" +#include + +namespace KokkosKernels { +namespace Impl { + +inline void print_cublas_version_if_enabled(std::ostream& os) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: " << cublas_version_string() << "\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: no\n"; +#endif +} + +inline void print_cusparse_version_if_enabled(std::ostream& os) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << cusparse_version_string() + << "\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; +#endif +} +inline void print_enabled_tpls(std::ostream& os) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_BLAS: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_BLAS: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CBLAS + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CBLAS: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CBLAS: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACKE + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_LAPACKE: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_LAPACKE: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_SUPERLU + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_SUPERLU: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_SUPERLU: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CHOLMOD + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CHOLMOD: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CHOLMOD: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + os << " " + 
<< "KOKKOSKERNELS_ENABLE_TPL_MKL: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_MKL: no\n"; +#endif + print_cublas_version_if_enabled(os); + print_cusparse_version_if_enabled(os); +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_METIS + os << "KOKKOSKERNELS_ENABLE_TPL_METIS: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_METIS: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ARMPL + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ARMPL: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ARMPL: no\n"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_MAGMA: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_MAGMA: no\n"; +#endif +} + +inline void print_version(std::ostream& os) { + // The version is printed as MAJOR.MINOR.PATCH, assembled from the + // KOKKOSKERNELS_VERSION_MAJOR, _MINOR and _PATCH macros + os << " " + << "KokkosKernels Version: " << KOKKOSKERNELS_VERSION_MAJOR << "." + << KOKKOSKERNELS_VERSION_MINOR << "." 
<< KOKKOSKERNELS_VERSION_PATCH + << '\n'; +} + +} // namespace Impl + +inline void print_configuration(std::ostream& os) { + Impl::print_version(os); + + os << "TPLs: \n"; + Impl::print_enabled_tpls(os); +} + +} // namespace KokkosKernels +#endif // _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP diff --git a/common/src/KokkosKernels_PrintUtils.hpp b/common/src/KokkosKernels_PrintUtils.hpp index eff4f1f43f..74b32c793a 100644 --- a/common/src/KokkosKernels_PrintUtils.hpp +++ b/common/src/KokkosKernels_PrintUtils.hpp @@ -104,7 +104,8 @@ inline std::enable_if_t= 2> kk_print_1Dview( return; } os << "[" << view.extent(0); - for (int i = 1; i < idx_array_type::rank; ++i) { + // ::rank is a Kokkos::...::integral_constant, not appropriate for `i` + for (int i = 1; i < int(idx_array_type::rank); ++i) { os << "x" << view.extent(i); } os << " multi-vector]" << std::endl; diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index baefbe8c35..a271695246 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -22,7 +22,7 @@ #define KOKKOSKERNELS_MACRO_MIN(x, y) ((x) < (y) ? (x) : (y)) #define KOKKOSKERNELS_MACRO_MAX(x, y) ((x) < (y) ? (y) : (x)) #define KOKKOSKERNELS_MACRO_ABS(x) \ - Kokkos::Details::ArithTraits::type>::abs(x) + Kokkos::ArithTraits::type>::abs(x) namespace KokkosKernels { @@ -38,7 +38,7 @@ class SquareRootFunctor { KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const { typedef typename ViewType::value_type value_type; - theView_(i) = Kokkos::Details::ArithTraits::sqrt(theView_(i)); + theView_(i) = Kokkos::ArithTraits::sqrt(theView_(i)); } private: @@ -79,22 +79,38 @@ struct InclusiveParallelPrefixSum { /*** * \brief Function performs the exclusive parallel prefix sum. That is each - * entry holds the sum until itself. \param num_elements: size of the array + * entry holds the sum until itself. 
+ * \param exec: the execution space instance on which to run + * \param num_elements: size of the array * \param arr: the array for which the prefix sum will be performed. */ template inline void kk_exclusive_parallel_prefix_sum( - typename view_t::value_type num_elements, view_t arr) { + const MyExecSpace &exec, typename view_t::value_type num_elements, + view_t arr) { typedef Kokkos::RangePolicy my_exec_space; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(0, num_elements), + my_exec_space(exec, 0, num_elements), ExclusiveParallelPrefixSum(arr)); } +/*** + * \brief Function performs the exclusive parallel prefix sum. That is each + * entry holds the sum until itself. + * \param num_elements: size of the array + * \param arr: the array for which the prefix sum will be performed. + */ +template +inline void kk_exclusive_parallel_prefix_sum( + typename view_t::value_type num_elements, view_t arr) { + kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr); +} + /*** * \brief Function performs the exclusive parallel prefix sum. That is each * entry holds the sum until itself. This version also returns the final sum * equivalent to the sum-reduction of arr before doing the scan. + * \param exec: the execution space instance on which to run * \param num_elements: size of the array * \param arr: the array for which the prefix sum will be performed. 
* \param finalSum: will be set to arr[num_elements - 1] after computing the @@ -102,14 +118,30 @@ inline void kk_exclusive_parallel_prefix_sum( */ template inline void kk_exclusive_parallel_prefix_sum( - typename view_t::value_type num_elements, view_t arr, - typename view_t::non_const_value_type &finalSum) { + const MyExecSpace &exec, typename view_t::value_type num_elements, + view_t arr, typename view_t::non_const_value_type &finalSum) { typedef Kokkos::RangePolicy my_exec_space; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(0, num_elements), + my_exec_space(exec, 0, num_elements), ExclusiveParallelPrefixSum(arr), finalSum); } +/*** + * \brief Function performs the exclusive parallel prefix sum. That is each + * entry holds the sum until itself. This version also returns the final sum + * equivalent to the sum-reduction of arr before doing the scan. + * \param num_elements: size of the array + * \param arr: the array for which the prefix sum will be performed. + * \param finalSum: will be set to arr[num_elements - 1] after computing the + * prefix sum. + */ +template +inline void kk_exclusive_parallel_prefix_sum( + typename view_t::value_type num_elements, view_t arr, + typename view_t::non_const_value_type &finalSum) { + kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); +} + /*** * \brief Function performs the inclusive parallel prefix sum. That is each * entry holds the sum until itself including itself. 
\param num_elements: size @@ -219,7 +251,7 @@ inline void kk_reduce_view2(size_t num_elements, view_t arr, } template ::mag_type> struct IsIdenticalFunctor { view_type1 view1; @@ -232,7 +264,7 @@ struct IsIdenticalFunctor { KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, size_t &is_equal) const { typedef typename view_type2::non_const_value_type val_type; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; typedef typename KAT::mag_type mag_type; const mag_type val_diff = KAT::abs(view1(i) - view2(i)); @@ -266,7 +298,7 @@ bool kk_is_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { } template ::mag_type> struct IsRelativelyIdenticalFunctor { view_type1 view1; @@ -380,6 +412,49 @@ KOKKOS_FORCEINLINE_FUNCTION Value xorshiftHash(Value v) { : static_cast(x * 2685821657736338717ULL - 1); } +struct ViewHashFunctor { + ViewHashFunctor(const uint8_t *data_) : data(data_) {} + + KOKKOS_INLINE_FUNCTION void operator()(size_t i, uint32_t &lhash) const { + // Compute a hash/digest of both the index i, and data[i]. Then add that to + // overall hash. + uint32_t x = uint32_t(i); + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + x ^= uint32_t(data[i]); + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + lhash += x; + } + + const uint8_t *data; +}; + +/// \brief Compute a hash of a view. +/// \param v: the view to hash. Must be contiguous, and its element type must +/// not contain any padding bytes. +template +uint32_t hashView(const View &v) { + assert(v.span_is_contiguous()); + // Note: This type trait is supposed to be part of C++17, + // but it's not defined on Intel 19 (with GCC 7.2.0 standard library). + // So just check if it's available before using. 
+#ifdef __cpp_lib_has_unique_object_representations + static_assert(std::has_unique_object_representations< + typename View::non_const_value_type>::value, + "KokkosKernels::Impl::hashView: the view's element type must " + "not have any padding bytes."); +#endif + size_t nbytes = v.span() * sizeof(typename View::value_type); + uint32_t h; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nbytes), + ViewHashFunctor(reinterpret_cast(v.data())), h); + return h; +} + template struct SequentialFillFunctor { using size_type = typename V::size_type; diff --git a/common/src/KokkosKernels_TplsVersion.hpp b/common/src/KokkosKernels_TplsVersion.hpp new file mode 100644 index 0000000000..38de7c1399 --- /dev/null +++ b/common/src/KokkosKernels_TplsVersion.hpp @@ -0,0 +1,57 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_TPLS_VERSIONS_HPP +#define _KOKKOSKERNELS_TPLS_VERSIONS_HPP + +#include "KokkosKernels_config.h" +#include + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) +#include "cublas_v2.h" +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#include "cusparse.h" +#endif + +namespace KokkosKernels { + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) +inline std::string cublas_version_string() { + // Print version + std::stringstream ss; + + ss << CUBLAS_VER_MAJOR << "." << CUBLAS_VER_MINOR << "." 
<< CUBLAS_VER_PATCH; + + return ss.str(); +} +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +inline std::string cusparse_version_string() { + // Print version + std::stringstream ss; + + ss << CUSPARSE_VER_MAJOR << "." << CUSPARSE_VER_MINOR << "." + << CUSPARSE_VER_PATCH << "." << CUSPARSE_VER_BUILD; + + return ss.str(); +} +#endif + +} // namespace KokkosKernels +#endif // _KOKKOSKERNELS_TPLS_VERSIONS_HPP diff --git a/common/src/KokkosKernels_UpperBound.hpp b/common/src/KokkosKernels_UpperBound.hpp new file mode 100644 index 0000000000..901c865743 --- /dev/null +++ b/common/src/KokkosKernels_UpperBound.hpp @@ -0,0 +1,101 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_UPPERBOUND_HPP +#define _KOKKOSKERNELS_UPPERBOUND_HPP + +/*! \file KokkosKernels_UpperBound.hpp + Define thread and team-collaborative upper-bound search + + Upper-bound search takes a Kokkos::View, a search value, and a binary + predicate. + It returns an index to the first element of the view such that pred(value, + element) is true + + This is implemented by calling lower_bound functions with inverted and + reflected predicates, i.e. 
upper_bound(view, val, pred) = lower_bound(view, + val, Neg(Refl(pred))); + + Examples: + \verbatim + value = 3 + view = {0,1,2,3,4} + = {f,f,f,f,t} + result = 4 + + value = -1 + view = {0,1,2,3,4} + = {t,t,t,t,t} + result = 0 + + value = 5 + view = {0,1,2,3,4} + = {f,f,f,f,f} + result = 5 + + value = 1 + view = {0,1,1,1,2} + = {f,f,f,f,t} + result = 4 + \endverbatim + + Contrast with lower-bound, which returns first index for which pred(element, + value) is false + */ + +#include "KokkosKernels_LowerBound.hpp" + +namespace KokkosKernels { + +/*! \brief single-thread upper-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(value,element) is true, + or view.size if no such element exists +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + return lower_bound_thread(view, value, Neg(Refl(pred))); +} + +/*! 
\brief team-collaborative upper-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(value,element) is true, + or view.size if no such element exists +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + return lower_bound_team(handle, view, value, Neg(Refl(pred))); +} + +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_UPPERBOUND_HPP \ No newline at end of file diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index fd04bd2529..2a4b749f92 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -837,7 +837,7 @@ template void zero_vector(typename value_array_type::value_type /* num_elements */, value_array_type &vector) { typedef typename value_array_type::non_const_value_type val_type; - Kokkos::deep_copy(vector, Kokkos::Details::ArithTraits::zero()); + Kokkos::deep_copy(vector, Kokkos::ArithTraits::zero()); } template diff --git a/common/src/KokkosKernels_default_types.hpp b/common/src/KokkosKernels_default_types.hpp index 9210264b61..672bdf3fbb 100644 --- a/common/src/KokkosKernels_default_types.hpp +++ b/common/src/KokkosKernels_default_types.hpp @@ -65,7 +65,7 @@ using default_device = Kokkos::Experimental::HIP; using default_device = Kokkos::Experimental::OpenMPTarget; #elif defined(KOKKOS_ENABLE_OPENMP) using default_device = Kokkos::OpenMP; -#elif defined(KOKKOS_ENABLE_PTHREAD) || defined(KOKKOS_ENABLE_THREADS) +#elif defined(KOKKOS_ENABLE_THREADS) using default_device = Kokkos::Threads; #else using default_device = Kokkos::Serial; diff --git a/common/src/KokkosKernels_helpers.hpp 
b/common/src/KokkosKernels_helpers.hpp index a7a1882700..b36360b991 100644 --- a/common/src/KokkosKernels_helpers.hpp +++ b/common/src/KokkosKernels_helpers.hpp @@ -19,6 +19,8 @@ #include "KokkosKernels_config.h" // KOKKOSKERNELS_INST_LAYOUTLEFT, KOKKOSKERNELS_INST_LAYOUTRIGHT #include "KokkosKernels_default_types.hpp" // default_layout +#include + namespace KokkosKernels { namespace Impl { @@ -67,6 +69,13 @@ struct GetUnifiedScalarViewType { type; }; +template +struct are_integral : std::bool_constant<((std::is_integral_v || + std::is_enum_v)&&...)> {}; + +template +inline constexpr bool are_integral_v = are_integral::value; + } // namespace Impl } // namespace KokkosKernels #endif diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 98ac27f1c9..31744f7a8f 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -18,7 +18,7 @@ #define KOKKOS_ARITHTRAITS_HPP /// \file Kokkos_ArithTraits.hpp -/// \brief Declaration and definition of Kokkos::Details::ArithTraits +/// \brief Declaration and definition of Kokkos::ArithTraits #include #include @@ -195,7 +195,6 @@ KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, } // namespace namespace Kokkos { -namespace Details { // Macro to automate the wrapping of Kokkos Mathematical Functions // in the ArithTraits struct for real floating point types, hopefully @@ -2043,13 +2042,12 @@ struct [[deprecated]] ArithTraits { }; #endif // HAVE_KOKKOS_QD -} // namespace Details +namespace Details { +template +using ArithTraits [[deprecated("Use Kokkos::ArithTraits instead")]] = + ::Kokkos::ArithTraits; -// Promote ArithTraits into Kokkos namespace. At some point, we -// will remove it from the Details namespace completely. We leave -// it there for now, because a lot of code depends on it being -// there. 
-using Details::ArithTraits; +} // namespace Details } // namespace Kokkos #endif // KOKKOS_ARITHTRAITS_HPP diff --git a/common/src/Kokkos_InnerProductSpaceTraits.hpp b/common/src/Kokkos_InnerProductSpaceTraits.hpp index 072125115c..c2bc475c45 100644 --- a/common/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/common/src/Kokkos_InnerProductSpaceTraits.hpp @@ -105,7 +105,7 @@ namespace Details { /// /// \section Kokkos_IPST_new Adding a specialization for a new type T /// -/// You must first add a specialization of ArithTraits. Please +/// You must first add a specialization of Kokkos::ArithTraits. Please /// note that if CUDA does not support using T in device functions, /// then you must not mark norm() or dot() as device functions /// in your specialization. (Simply omit the KOKKOS_FORCEINLINE_FUNCTION @@ -119,14 +119,14 @@ class InnerProductSpaceTraits { typedef T val_type; //! The type returned by norm(x) for a value x of type val_type. - typedef typename ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; //! The type returned by dot(x,y) for values x and y of type val_type. typedef val_type dot_type; //! The "norm" (absolute value or magnitude) of a value x of type val_type. static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } /// \brief The "dot product" of two values x and y of type val_type. 
/// @@ -146,11 +146,11 @@ class InnerProductSpaceTraits { template <> struct InnerProductSpaceTraits { typedef long double val_type; - typedef ArithTraits::mag_type mag_type; + typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -160,11 +160,11 @@ template class InnerProductSpaceTraits> { public: typedef Kokkos::complex val_type; - typedef typename ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, const val_type& y) { @@ -179,11 +179,11 @@ class InnerProductSpaceTraits> { template struct InnerProductSpaceTraits> { typedef std::complex val_type; - typedef typename ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return std::conj(x) * y; @@ -200,11 +200,11 @@ struct InnerProductSpaceTraits> { template <> struct InnerProductSpaceTraits<__float128> { typedef __float128 val_type; - typedef typename ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -223,17 +223,17 @@ struct InnerProductSpaceTraits<__float128> { // functions. 
It should be possible to use Kokkos' support for // aggregate types to implement device function support for dd_real // and qd_real, but we have not done this yet (as of 07 Jan 2014). -// Hence, the class methods of the ArithTraits specializations for +// Hence, the class methods of the Kokkos::ArithTraits specializations for // dd_real and qd_real are not marked as device functions. #ifdef HAVE_KOKKOS_QD template <> struct InnerProductSpaceTraits { typedef dd_real val_type; - typedef ArithTraits::mag_type mag_type; + typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -241,11 +241,11 @@ struct InnerProductSpaceTraits { template <> struct InnerProductSpaceTraits { typedef qd_real val_type; - typedef ArithTraits::mag_type mag_type; + typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index dd368f009b..2ccf9c2103 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -23,5 +23,9 @@ #include #include #include +#include +#include +#include +#include #endif // TEST_COMMON_HPP diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 29d0498055..8aa963b2ab 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -15,15 +15,15 @@ //@HEADER /// \file ArithTraitsTest.hpp -/// \brief Templated test for Kokkos::Details::ArithTraits +/// \brief Templated test for Kokkos::ArithTraits /// /// This header file is an implementation detail 
of the tests for -/// Kokkos::Details::ArithTraits. Users must not rely on it existing, +/// Kokkos::ArithTraits. Users must not rely on it existing, /// or on its contents. This header file should not be /// installed with Kokkos' other header files. /// /// On the other hand, this header file does give examples of how to -/// use Kokkos::Details::ArithTraits, so it may be useful for users to +/// use Kokkos::ArithTraits, so it may be useful for users to /// read it. #ifndef KOKKOS_ARITHTRAITSTEST_HPP @@ -51,7 +51,7 @@ #endif namespace { -// Whether Kokkos::Details::ArithTraits implements +// Whether Kokkos::ArithTraits implements // transcendental functions. These include sqrt, pow, log, and // log10. template @@ -92,8 +92,8 @@ struct HasTranscendentals { } // namespace /// \class ArithTraitsTesterBase -/// \brief Base class providing tests for Kokkos::Details::ArithTraits -/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \brief Base class providing tests for Kokkos::ArithTraits +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// has a specialization, and which can be executed on the parallel /// device. /// \tparam DeviceType A Kokkos parallel device type. @@ -107,8 +107,8 @@ struct HasTranscendentals { /// types. /// /// This class provides a Kokkos reduction operator for testing -/// Kokkos::Details::ArithTraits. This test works for any type -/// ScalarType for which Kokkos::Details::ArithTraits has a +/// Kokkos::ArithTraits. This test works for any type +/// ScalarType for which Kokkos::ArithTraits has a /// specialization, and which can be executed on the parallel device. 
/// /// The tests include those suitable for execution on the parallel @@ -162,7 +162,7 @@ class ArithTraitsTesterBase { KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // not using this argument int success = 1; @@ -273,7 +273,7 @@ class ArithTraitsTesterBase { /// /// \return \c 1 if all the tests pass, else \c 0. int testHost(std::ostream& out) const { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; using std::endl; int success = 1; @@ -378,7 +378,7 @@ class ArithTraitsTesterBase { /// \brief Base class of ArithTraitsTester that exercises /// transcendental functions, if and only if ArithTraits /// implements them. -/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// implements transcendental functions, along with the requirements /// imposed by ArithTraitsTesterBase. /// \tparam DeviceType A Kokkos parallel device type. 
@@ -441,7 +441,7 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - // typedef Kokkos::Details::ArithTraits AT; + // typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -462,7 +462,7 @@ class ArithTraitsTesterTranscendentalBase protected: virtual int testHostImpl(std::ostream& out) const { using std::endl; - // typedef Kokkos::Details::ArithTraits AT; + // typedef Kokkos::ArithTraits AT; int success = 1; if (HasTranscendentals::value) { @@ -495,20 +495,16 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_INLINE_FUNCTION bool equal(const ScalarType& a, const ScalarType& b) const { - if (b != Kokkos::Details::ArithTraits::zero()) { + if (b != Kokkos::ArithTraits::zero()) { if (a > b) - return (a - b) / b < - 2 * Kokkos::Details::ArithTraits::epsilon(); + return (a - b) / b < 2 * Kokkos::ArithTraits::epsilon(); else - return (b - a) / b < - 2 * Kokkos::Details::ArithTraits::epsilon(); + return (b - a) / b < 2 * Kokkos::ArithTraits::epsilon(); } else { if (a > b) - return (a - b) < - 2 * Kokkos::Details::ArithTraits::epsilon(); + return (a - b) < 2 * Kokkos::ArithTraits::epsilon(); else - return (b - a) < - 2 * Kokkos::Details::ArithTraits::epsilon(); + return (b - a) < 2 * Kokkos::ArithTraits::epsilon(); } } @@ -524,7 +520,7 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -733,7 +729,7 @@ class ArithTraitsTesterTranscendentalBase protected: virtual int testHostImpl(std::ostream& out) const { using std::endl; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int success = 1; if (!HasTranscendentals::value) { @@ -946,7 +942,7 @@ class 
ArithTraitsTesterTranscendentalBase }; /// \class ArithTraitsTesterComplexBase -/// \brief Execute Kokkos::Details::ArithTraits tests relevant to +/// \brief Execute Kokkos::ArithTraits tests relevant to /// complex numbers (whether or not \c ScalarType is itself a /// complex-valued type). /// @@ -958,8 +954,7 @@ class ArithTraitsTesterTranscendentalBase /// complex, but the specific tests that are run will depend on /// ScalarType. template ::is_complex> + const int is_complex = Kokkos::ArithTraits::is_complex> class ArithTraitsTesterComplexBase : public ArithTraitsTesterTranscendentalBase { private: @@ -1009,7 +1004,7 @@ class ArithTraitsTesterComplexBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -1048,7 +1043,7 @@ class ArithTraitsTesterComplexBase protected: virtual int testHostImpl(std::ostream& out) const { using std::endl; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int success = 1; // Apparently, std::numeric_limits::is_signed is 1 only for real @@ -1095,7 +1090,7 @@ class ArithTraitsTesterComplexBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -1103,7 +1098,7 @@ class ArithTraitsTesterComplexBase FAILURE(); } typedef typename AT::mag_type mag_type; - const mag_type one = Kokkos::Details::ArithTraits::one(); + const mag_type one = Kokkos::ArithTraits::one(); // This presumes that ScalarType, being a complex number, has a // constructor which takes two mag_type arguments. 
@@ -1129,7 +1124,7 @@ class ArithTraitsTesterComplexBase protected: virtual int testHostImpl(std::ostream& out) const { using std::endl; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int success = 1; if (!AT::is_complex) { @@ -1137,7 +1132,7 @@ class ArithTraitsTesterComplexBase FAILURE(); } typedef typename AT::mag_type mag_type; - const mag_type one = Kokkos::Details::ArithTraits::one(); + const mag_type one = Kokkos::ArithTraits::one(); // This presumes that ScalarType, being a complex number, has a // constructor which takes two mag_type arguments. @@ -1173,7 +1168,7 @@ class ArithTraitsTesterComplexBase /// \tparam DeviceType A Kokkos parallel device type. /// /// Kokkos reduction operator for testing those attributes of -/// Kokkos::Details::ArithTraits relevant to floating-point types. +/// Kokkos::ArithTraits relevant to floating-point types. /// /// The tests include those suitable for execution on the parallel /// device (operator()) and those suitable for execution on the host @@ -1181,17 +1176,14 @@ class ArithTraitsTesterComplexBase /// executions of the test. All redundant executions must return /// '1' (passed). template ::is_exact> + const int is_exact = Kokkos::ArithTraits::is_exact> class ArithTraitsTesterFloatingPointBase : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> { + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { private: //! The base class of this class. typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> base_type; public: @@ -1217,13 +1209,11 @@ class ArithTraitsTesterFloatingPointBase template class ArithTraitsTesterFloatingPointBase : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> { + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { private: //! 
The base class of this class. typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> base_type; public: @@ -1238,7 +1228,7 @@ class ArithTraitsTesterFloatingPointBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -1284,7 +1274,7 @@ class ArithTraitsTesterFloatingPointBase protected: virtual int testHostImpl(std::ostream& out) const { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; using std::endl; int success = 1; @@ -1338,13 +1328,11 @@ class ArithTraitsTesterFloatingPointBase template class ArithTraitsTesterFloatingPointBase : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> { + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { private: //! The base class of this class. 
typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> base_type; public: @@ -1359,7 +1347,7 @@ class ArithTraitsTesterFloatingPointBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -1380,7 +1368,7 @@ class ArithTraitsTesterFloatingPointBase protected: virtual int testHostImpl(std::ostream& out) const { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; using std::endl; int success = 1; @@ -1399,8 +1387,8 @@ class ArithTraitsTesterFloatingPointBase }; /// \class ArithTraitsTester -/// \brief Tests for Kokkos::Details::ArithTraits -/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \brief Tests for Kokkos::ArithTraits +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// has a specialization, and which can be executed on the parallel /// device. /// \tparam DeviceType A Kokkos parallel device type. @@ -1415,9 +1403,9 @@ class ArithTraitsTesterFloatingPointBase /// for host functions do use run-time polymorphism. /// /// This class (through its base class) provides a Kokkos reduction -/// operator for testing Kokkos::Details::ArithTraits. This test +/// operator for testing Kokkos::ArithTraits. This test /// works for any type ScalarType for which -/// Kokkos::Details::ArithTraits has a specialization, and which can +/// Kokkos::ArithTraits has a specialization, and which can /// be executed on the parallel device. /// /// The tests include those suitable for execution on the parallel @@ -1438,8 +1426,8 @@ class ArithTraitsTester KOKKOS_INLINE_FUNCTION ArithTraitsTester() {} }; -/// \brief Run the Kokkos::Details::ArithTraits tests on the parallel device. 
-/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \brief Run the Kokkos::ArithTraits tests on the parallel device. +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// has a specialization, and which can be executed on the parallel /// device. /// \tparam DeviceType A Kokkos parallel device type. @@ -1457,17 +1445,15 @@ int testArithTraitsOnDevice(std::ostream& out, const int verbose) { functor_type(), success); if (success) { if (verbose) - out << Kokkos::Details::ArithTraits::name() << " passed" - << endl; + out << Kokkos::ArithTraits::name() << " passed" << endl; } else { - out << Kokkos::Details::ArithTraits::name() << " FAILED" - << endl; + out << Kokkos::ArithTraits::name() << " FAILED" << endl; } return success; } -/// \brief Run the Kokkos::Details::ArithTraits tests on the host. -/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \brief Run the Kokkos::ArithTraits tests on the host. +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// has a specialization. /// \tparam DeviceType A Kokkos parallel device type. /// @@ -1482,16 +1468,14 @@ int testArithTraitsOnHost(std::ostream& out, const int verbose) { if (localSuccess) { if (verbose) - out << Kokkos::Details::ArithTraits::name() << " passed" - << endl; + out << Kokkos::ArithTraits::name() << " passed" << endl; } else { - out << Kokkos::Details::ArithTraits::name() << " FAILED" - << endl; + out << Kokkos::ArithTraits::name() << " FAILED" << endl; } return localSuccess; } -/// \brief Run the Kokkos::Details::ArithTraits tests for all (valid) +/// \brief Run the Kokkos::ArithTraits tests for all (valid) /// scalar types, on the given parallel device. /// \tparam DeviceType A Kokkos parallel device type. 
/// @@ -1586,7 +1570,7 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { return success && curSuccess; } -/// \brief Run the Kokkos::Details::ArithTraits tests for all scalar +/// \brief Run the Kokkos::ArithTraits tests for all scalar /// types, on the host. /// \tparam DeviceType A Kokkos parallel device type. /// diff --git a/common/unit_test/Test_Common_Iota.hpp b/common/unit_test/Test_Common_Iota.hpp new file mode 100644 index 0000000000..cae207d56b --- /dev/null +++ b/common/unit_test/Test_Common_Iota.hpp @@ -0,0 +1,126 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef TEST_COMMON_IOTA_HPP +#define TEST_COMMON_IOTA_HPP + +#include + +#include "KokkosKernels_Iota.hpp" + +template +void test_iota_constructor() { + // empty iota + { + Iota i; + EXPECT_EQ(i.size(), 0); + } + + // basic iota + { + Iota ten(10); + EXPECT_EQ(ten.size(), 10); + for (size_t i = 0; i < ten.size(); ++i) { + EXPECT_EQ(ten(i), i); + } + } + + // iota with negative offset + if constexpr (std::is_signed_v) { + Iota three(3, -7); + EXPECT_EQ(three.size(), 3); + for (size_t i = 0; i < three.size(); ++i) { + EXPECT_EQ(three(i), T(i) - T(7)); + } + } + + // iota with positive offset + { + Iota three(3, 2); + EXPECT_EQ(three.size(), 3); + for (size_t i = 0; i < three.size(); ++i) { + EXPECT_EQ(three(i), i + 2); + } + } + + // negative sizes are capped at 0 + if constexpr (std::is_signed_v) { + { + Iota i(-7); + EXPECT_EQ(i.size(), 0); + } + { + Iota i(-1, 2); + EXPECT_EQ(i.size(), 0); + } + } +} + +template +void test_iota_rank() { + EXPECT_EQ((Iota::rank), 1); +} + +template +void test_iota_subview() { + // get the 7th and 8th elements of an Iota + Iota ten(10, 1); // 1..<11 + Iota sub(ten, Kokkos::pair{7, 9}); // 8, 9 + + EXPECT_EQ(sub.size(), 2); + EXPECT_EQ(sub(0), 8); + EXPECT_EQ(sub(1), 9); +} + +template +void test_is_iota() { + static_assert(KokkosKernels::Impl::is_iota_v>, + "Iota should be an Iota"); + static_assert(!KokkosKernels::Impl::is_iota_v, + "int should not be an Iota"); +} + +template +void test_iota() { + test_is_iota(); + test_iota_constructor(); + test_iota_rank(); + test_iota_subview(); +} + +TEST_F(TestCategory, common_iota) { + test_iota(); + test_iota(); + test_iota(); + test_iota(); + + test_iota(); + test_iota(); + test_iota(); + test_iota(); + + test_iota(); + test_iota(); + test_iota(); + test_iota(); + + test_iota(); + test_iota(); + test_iota(); + test_iota(); +} + +#endif // TEST_COMMON_IOTA_HPP diff --git 
a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp new file mode 100644 index 0000000000..3ff27da23c --- /dev/null +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -0,0 +1,258 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file Test_Common_LowerBound.hpp +/// \brief Tests lower bounds search routines + +#include +#include + +template +size_t std_lower_bound(const std::vector &haystack, + const Ordinal needle) { + const auto it = std::lower_bound(haystack.begin(), haystack.end(), needle); + return it - haystack.begin(); +} + +/*! 
\brief count the number of incorrect values */ +template +struct ThreadLowerBoundFunctor { + using hv_value_type = typename HaystackView::non_const_value_type; + using hv_size_type = typename HaystackView::size_type; + + ThreadLowerBoundFunctor(const hv_size_type &expected, + const HaystackView &haystack, + const hv_value_type &needle) + : expected_(expected), haystack_(haystack), needle_(needle) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i, int &lerrCount) const { + if (0 == i) { + hv_size_type idx = KokkosKernels::lower_bound_thread(haystack_, needle_); + if (idx != expected_) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(i), + int(expected_), int(idx)); + ++lerrCount; + } + } + } + + hv_size_type expected_; + HaystackView haystack_; + hv_value_type needle_; +}; + +template +void test_lower_bound_thread(const std::vector &_haystack, + const T &_needle) { + using execution_space = typename Device::execution_space; + using Policy = Kokkos::RangePolicy; + using view_t = Kokkos::View; + using u_const_view_t = Kokkos::View>; + using size_type = typename u_const_view_t::size_type; + + // get expected value + const size_type expected = std_lower_bound(_haystack, _needle); + + // create device views of input data + u_const_view_t uhaystack(_haystack.data(), _haystack.size()); + view_t haystack("haystack", uhaystack.size()); + Kokkos::deep_copy(haystack, uhaystack); + + // test lower_bound search + int errCount; + // run a single thread + Kokkos::parallel_reduce(Policy(0, 1), + ThreadLowerBoundFunctor(expected, haystack, _needle), + errCount); + + EXPECT_EQ(0, errCount); +} + +/*! 
\brief count the number of incorrect values */ +template +struct TeamLowerBoundFunctor { + using hv_value_type = typename HaystackView::non_const_value_type; + using hv_size_type = typename HaystackView::size_type; + + TeamLowerBoundFunctor(const hv_size_type &expected, + const HaystackView &haystack, + const hv_value_type &needle) + : expected_(expected), haystack_(haystack), needle_(needle) {} + + KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, + int &lerrCount) const { + hv_size_type idx = + KokkosKernels::lower_bound_team(handle, haystack_, needle_); + if (idx != expected_) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); + ++lerrCount; + } + } + + hv_size_type expected_; + HaystackView haystack_; + hv_value_type needle_; +}; + +template +void test_lower_bound_team(const std::vector &_haystack, const T _needle) { + using execution_space = typename Device::execution_space; + using Policy = Kokkos::TeamPolicy; + using Member = typename Policy::member_type; + using view_t = Kokkos::View; + using u_const_view_t = Kokkos::View>; + using size_type = typename u_const_view_t::size_type; + + // get expected value + const size_type expected = std_lower_bound(_haystack, _needle); + + // create device views of input data + u_const_view_t uhaystack(_haystack.data(), _haystack.size()); + view_t haystack("haystack", uhaystack.size()); + Kokkos::deep_copy(haystack, uhaystack); + + // test lower_bound search + const int leagueSize = 1; + const int teamSize = + KokkosKernels::Impl::kk_is_gpu_exec_space() ? 
64 : 1; + int errCount; + Kokkos::parallel_reduce( + Policy(leagueSize, teamSize), + TeamLowerBoundFunctor(expected, haystack, _needle), + errCount); + + EXPECT_EQ(0, errCount); +} + +template +void test_lower_bound(const std::vector &haystack, const T needle) { + test_lower_bound_thread(haystack, needle); + test_lower_bound_team(haystack, needle); +} + +template +T randn(T n) { + if constexpr (std::is_floating_point_v) { + return T(rand()) / T(RAND_MAX) * n; + } else { + return T(rand()) % n; + } +} + +/* define specific and random lower-bound test cases + */ +template +void test_lower_bound() { + test_lower_bound({}, T(0)); + test_lower_bound({}, T(1)); + test_lower_bound({}, T(-1)); + + test_lower_bound({0}, T(0)); + test_lower_bound({0}, T(1)); + test_lower_bound({0}, T(-1)); + + test_lower_bound({1}, T(0)); + test_lower_bound({1}, T(1)); + test_lower_bound({1}, T(-1)); + + test_lower_bound({T(-1)}, T(0)); + test_lower_bound({T(-1)}, T(1)); + test_lower_bound({T(-1)}, T(-1)); + + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(-1)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(0)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(1)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(2)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(2.4)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(2.5)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(2.6)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(3)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(4)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(5)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(6)); + + auto randn = [](T n) -> T { + T ret; + if constexpr (std::is_floating_point_v) { + ret = T(rand()) / T(RAND_MAX) * n; + } else { + ret = T(rand()) % n; + } + return ret; + }; + + T maxEntry = 20; + const int numTests = 100; + for (int n = 0; n < numTests; ++n) { + for (size_t sz : {10, 100, 1000}) { + // generate a sorted random vector + std::vector haystack; + for (size_t i = 0; i < sz; ++i) { + 
haystack.push_back(randn(maxEntry)); + } + std::sort(haystack.begin(), haystack.end()); + + // generate a random value to search for + const T needle = randn(maxEntry); + + // do the test + test_lower_bound(haystack, needle); + } + } +} + +#define EXECUTE_TEST(T, DEVICE) \ + TEST_F(TestCategory, common##_##lower_bound##_##T##_##DEVICE) { \ + test_lower_bound(); \ + } + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(int64_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(float, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, TestExecSpace) +#endif + +#undef EXECUTE_TEST diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp new file mode 100644 index 0000000000..07a55e152b --- /dev/null +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -0,0 +1,63 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file Test_Common_PrintConfiguration.hpp +/// \brief Tests for print configuration + +#ifndef KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP +#define KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP + +#include "KokkosKernels_PrintConfiguration.hpp" + +/// \brief Verify that all keys from kernels configuration are present and +/// check their values +void check_print_configuration(const std::ostringstream& msg) { + bool kernelsVersionKeyFound = false; + bool enabledTPLsNamesKeyFound = false; + // Iterate over lines returned from kokkos and extract key:value pairs + std::stringstream ss{msg.str()}; + for (std::string line; std::getline(ss, line, '\n');) { + auto found = line.find_first_of(':'); + if (found != std::string::npos) { + auto currentKey = line.substr(0, found); + if (currentKey == " KokkosKernels Version") { + kernelsVersionKeyFound = true; + } else if (currentKey == "TPLs") { + enabledTPLsNamesKeyFound = true; + } + } + } + EXPECT_TRUE(kernelsVersionKeyFound && enabledTPLsNamesKeyFound); +} + +/// \brief Verify that print_configuration prints the expected keys from Kernels +/// configuration +template +void testPrintConfiguration() { + // First, print this to cout in order to see what it looks like + KokkosKernels::print_configuration(std::cout); + // Then, run the actual test which prints the string to "out" and verifies + // that out has met some expected behavior + std::ostringstream out; + KokkosKernels::print_configuration(out); + check_print_configuration(out); +} + +TEST_F(TestCategory, common_print_configuration) { + testPrintConfiguration(); +} + +#endif // KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp new file mode 100644 index 0000000000..a6d3b24d84 --- /dev/null +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -0,0 +1,249 @@
+//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file Test_Common_UpperBound.hpp +/// \brief Tests upper bounds search routines + +#include +#include + +template +size_t std_upper_bound(const std::vector &haystack, + const Ordinal needle) { + const auto it = std::upper_bound(haystack.begin(), haystack.end(), needle); + return it - haystack.begin(); +} + +/*! \brief count the number of incorrect values */ +template +struct ThreadUpperBoundFunctor { + using hv_value_type = typename HaystackView::non_const_value_type; + using hv_size_type = typename HaystackView::size_type; + + ThreadUpperBoundFunctor(const hv_size_type &expected, + const HaystackView &haystack, + const hv_value_type &needle) + : expected_(expected), haystack_(haystack), needle_(needle) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i, int &lerrCount) const { + if (0 == i) { + hv_size_type idx = KokkosKernels::upper_bound_thread(haystack_, needle_); + if (idx != expected_) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(i), + int(expected_), int(idx)); + ++lerrCount; + } + } + } + + hv_size_type expected_; + HaystackView haystack_; + hv_value_type needle_; +}; + +template +void test_upper_bound_thread(const std::vector &_haystack, + const T &_needle) { + using execution_space = typename Device::execution_space; + using Policy = Kokkos::RangePolicy; + using view_t = Kokkos::View; + using u_const_view_t = Kokkos::View>; + using hv_size_type = 
typename u_const_view_t::size_type; + + // get expected value + const hv_size_type expected = std_upper_bound(_haystack, _needle); + + // create device views of input data + u_const_view_t uhaystack(_haystack.data(), _haystack.size()); + view_t haystack("haystack", uhaystack.size()); + Kokkos::deep_copy(haystack, uhaystack); + + // test upper_bound search + int errCount; + // run a single thread + Kokkos::parallel_reduce(Policy(0, 1), + ThreadUpperBoundFunctor(expected, haystack, _needle), + errCount); + + EXPECT_EQ(0, errCount); +} + +/*! \brief count the number of incorrect values */ +template +struct TeamUpperBoundFunctor { + using hv_value_type = typename HaystackView::non_const_value_type; + using hv_size_type = typename HaystackView::size_type; + + TeamUpperBoundFunctor(const hv_size_type &expected, + const HaystackView &haystack, + const hv_value_type &needle) + : expected_(expected), haystack_(haystack), needle_(needle) {} + + KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, + int &lerrCount) const { + hv_size_type idx = + KokkosKernels::upper_bound_team(handle, haystack_, needle_); + if (idx != expected_) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); + ++lerrCount; + } + } + + hv_size_type expected_; + HaystackView haystack_; + hv_value_type needle_; +}; + +template +void test_upper_bound_team(const std::vector &_haystack, const T _needle) { + using execution_space = typename Device::execution_space; + using Policy = Kokkos::TeamPolicy; + using Member = typename Policy::member_type; + using view_t = Kokkos::View; + using u_const_view_t = Kokkos::View>; + using hv_size_type = typename u_const_view_t::size_type; + + // get expected value + const hv_size_type expected = std_upper_bound(_haystack, _needle); + + // create device views of input data + u_const_view_t uhaystack(_haystack.data(), _haystack.size()); + view_t haystack("haystack", 
uhaystack.size()); + Kokkos::deep_copy(haystack, uhaystack); + + // test upper_bound search + const int leagueSize = 1; + const int teamSize = + KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + int errCount; + Kokkos::parallel_reduce( + Policy(leagueSize, teamSize), + TeamUpperBoundFunctor(expected, haystack, _needle), + errCount); + + EXPECT_EQ(0, errCount); +} + +template +void test_upper_bound(const std::vector &haystack, const T needle) { + test_upper_bound_thread(haystack, needle); + test_upper_bound_team(haystack, needle); +} + +/* define specific and random upper-bound test cases + */ +template +void test_upper_bound() { + test_upper_bound({}, T(0)); + test_upper_bound({}, T(1)); + test_upper_bound({}, T(-1)); + + test_upper_bound({0}, T(0)); + test_upper_bound({0}, T(1)); + test_upper_bound({0}, T(-1)); + + test_upper_bound({1}, T(0)); + test_upper_bound({1}, T(1)); + test_upper_bound({1}, T(-1)); + + test_upper_bound({T(-1)}, T(0)); + test_upper_bound({T(-1)}, T(1)); + test_upper_bound({T(-1)}, T(-1)); + + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(-1)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(0)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(1)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(2)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(2.4)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(2.5)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(2.6)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(3)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(4)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(5)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(6)); + + auto randn = [](T n) -> T { + T ret; + if constexpr (std::is_floating_point_v) { + ret = T(rand()) / T(RAND_MAX) * n; + } else { + ret = T(rand()) % n; + } + return ret; + }; + + constexpr T maxEntry = 20; + const int numTests = 100; + for (int n = 0; n < numTests; ++n) { + for (size_t sz : {10, 100, 1000}) { + // generate a sorted random vector + std::vector haystack; + for (size_t i = 
0; i < sz; ++i) { + haystack.push_back(randn(maxEntry)); + } + std::sort(haystack.begin(), haystack.end()); + + // generate a random value to search for + const T needle = randn(maxEntry); + + // do the test + test_upper_bound(haystack, needle); + } + } +} + +#define EXECUTE_TEST(T, DEVICE) \ + TEST_F(TestCategory, common##_##upper_bound##_##T##_##DEVICE) { \ + test_upper_bound(); \ + } + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(int64_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(float, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, TestExecSpace) +#endif + +#undef EXECUTE_TEST diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 40680b0705..41be4736c3 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -1,10 +1,7 @@ # Source: https://devblogs.microsoft.com/cppblog/clear-functional-c-documentation-with-sphinx-breathe-doxygen-cmake/ # Author: Evan Harvey find_package(Doxygen REQUIRED) -find_package(Sphinx REQUIRED) -set(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}) -set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/docs/sphinx) set(KOKKOS_INCLUDE_DIR ${Kokkos_DIR}/../../../include) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/conf.py DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) @@ -36,12 +33,21 @@ add_custom_command(OUTPUT 
${DOXYGEN_INDEX_FILE} add_custom_target(Doxygen ALL DEPENDS ${DOXYGEN_INDEX_FILE}) - -add_custom_target(Sphinx ALL - COMMAND ${SPHINX_EXECUTABLE} -b html - # Tell Breathe where to find the Doxygen output - -Dbreathe_projects.${PROJECT_NAME}=${DOXYGEN_OUTPUT_DIR}/xml - ${SPHINX_SOURCE} ${SPHINX_BUILD} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS Doxygen - COMMENT "Generating documentation with Sphinx") +## If we can find sphinx, add that target too +find_package(Sphinx) + +if (Sphinx_FOUND) + set(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}) + set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/docs/sphinx) + + add_custom_target(Sphinx ALL + COMMAND ${SPHINX_EXECUTABLE} -W --keep-going -b html + # Tell Breathe where to find the Doxygen output + -Dbreathe_projects.${PROJECT_NAME}=${DOXYGEN_OUTPUT_DIR}/xml + ${SPHINX_SOURCE} ${SPHINX_BUILD} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS Doxygen + COMMENT "Generating documentation with Sphinx") +else() # Sphinx_FOUND + message(STATUS "Sphinx not found. Only Doxygen docs can be built") +endif() # Sphinx_FOUND diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 5cb072a465..954b6b669b 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -606,7 +606,7 @@ HIDE_COMPOUND_REFERENCE= NO # will show which file needs to be included to use the class. # The default value is: YES. -SHOW_HEADERFILE = YES +# SHOW_HEADERFILE = YES # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. @@ -824,7 +824,7 @@ WARN_IF_DOC_ERROR = YES # parameters have no documentation without warning. # The default value is: YES. -WARN_IF_INCOMPLETE_DOC = YES +# WARN_IF_INCOMPLETE_DOC = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return @@ -843,7 +843,7 @@ WARN_NO_PARAMDOC = NO # Possible values are: NO, YES and FAIL_ON_WARNINGS. 
# The default value is: NO. -WARN_AS_ERROR = NO +WARN_AS_ERROR = FAIL_ON_WARNINGS # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which @@ -1571,7 +1571,7 @@ GENERATE_TREEVIEW = NO # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -FULL_SIDEBAR = NO +# FULL_SIDEBAR = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. @@ -1654,7 +1654,7 @@ USE_MATHJAX = NO # The default value is: MathJax_2. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_VERSION = MathJax_2 +# MATHJAX_VERSION = MathJax_2 # When MathJax is enabled you can set the default output format to be used for # the MathJax output. For more details about the output format see MathJax @@ -2199,7 +2199,7 @@ ENABLE_PREPROCESSING = YES # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -MACRO_EXPANSION = NO +MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then # the macro expansion is limited to the macros specified with the PREDEFINED and @@ -2207,7 +2207,7 @@ MACRO_EXPANSION = NO # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_ONLY_PREDEF = NO +EXPAND_ONLY_PREDEF = YES # If the SEARCH_INCLUDES tag is set to YES, the include files in the # INCLUDE_PATH will be searched if a #include is found. @@ -2239,7 +2239,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = DOXY +PREDEFINED = DOXY \ + "KOKKOS_INLINE_FUNCTION=" # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
The diff --git a/docs/conf.py b/docs/conf.py index ce7385acad..f7027880c5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,4 +79,4 @@ def configureDoxyfile(input_dir, output_dir, doxyfile_in, doxyfile_out): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] diff --git a/docs/developer/apidocs.rst b/docs/developer/apidocs.rst index 82797c5801..a3df431c6a 100644 --- a/docs/developer/apidocs.rst +++ b/docs/developer/apidocs.rst @@ -11,4 +11,5 @@ The source documentation is extracted from the C++ files using Doxygen. apidocs/blas3 apidocs/sparse apidocs/batched_dense + apidocs/batched_dense_host apidocs/batched_sparse \ No newline at end of file diff --git a/docs/developer/apidocs/batched_dense_host.rst b/docs/developer/apidocs/batched_dense_host.rst new file mode 100644 index 0000000000..d6392067b4 --- /dev/null +++ b/docs/developer/apidocs/batched_dense_host.rst @@ -0,0 +1,8 @@ +BATCHED -- KokkosKernels batched host-level interfaces +========================================================= + +BatchedGemm +----------- +.. doxygenfunction:: KokkosBatched::BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) +.. doxygenclass:: KokkosBatched::BatchedGemmHandle + :members: \ No newline at end of file diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst index 1a68066271..72c2612c7f 100644 --- a/docs/developer/apidocs/blas1.rst +++ b/docs/developer/apidocs/blas1.rst @@ -1,9 +1,15 @@ BLAS1 -- KokkosKernels blas1 interfaces ======================================= +abs +--- +.. doxygenfunction:: KokkosBlas::abs(const execution_space& space, const RMV& R, const XMV& X) +.. 
doxygenfunction:: KokkosBlas::abs(const RMV& R, const XMV& X) + axpby ----- -.. doxygenfunction:: KokkosBlas::axpby +.. doxygenfunction:: KokkosBlas::axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, const YMV& Y) +.. doxygenfunction:: KokkosBlas::axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) dot --- @@ -12,11 +18,13 @@ dot fill ---- -.. doxygenfunction:: KokkosBlas::fill +.. doxygenfunction:: KokkosBlas::fill(const execution_space& space, const XMV& X, const typename XMV::non_const_value_type& val) +.. doxygenfunction:: KokkosBlas::fill(const XMV& X, const typename XMV::non_const_value_type& val) mult ---- -.. doxygenfunction:: KokkosBlas::mult +.. doxygenfunction:: KokkosBlas::mult(const execution_space& space, typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) +.. doxygenfunction:: KokkosBlas::mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) nrm1 ---- @@ -40,21 +48,24 @@ nrminf reciprocal ---------- -.. doxygenfunction:: KokkosBlas::reciprocal +.. doxygenfunction:: KokkosBlas::reciprocal(const execution_space& space, const RMV& R, const XMV& X) +.. doxygenfunction:: KokkosBlas::reciprocal(const RMV& R, const XMV& X) scal ---- -.. doxygenfunction:: KokkosBlas::scal +.. doxygenfunction:: KokkosBlas::scal(const execution_space& space, const RMV& R, const AV& a, const XMV& X) +.. doxygenfunction:: KokkosBlas::scal(const RMV& R, const AV& a, const XMV& X) sum --- .. doxygenfunction:: KokkosBlas::sum(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) swap ---- -.. doxygenfunction:: KokkosBlas::swap(execution_space const& space, XVector const& X, YVector const& Y) -.. doxygenfunction:: KokkosBlas::swap(XVector const& X, YVector const& Y) +---- +.. doxygenfunction:: KokkosBlas::swap(execution_space const&, XVector const&, YVector const&) +.. 
doxygenfunction:: KokkosBlas::swap(const XVector&, const YVector&) update ------ -.. doxygenfunction:: KokkosBlas::update +.. doxygenfunction:: KokkosBlas::update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) +.. doxygenfunction:: KokkosBlas::update(const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst index 1d9a3f3fa7..20dbc5ea9a 100644 --- a/docs/developer/apidocs/blas2.rst +++ b/docs/developer/apidocs/blas2.rst @@ -4,4 +4,9 @@ BLAS2 -- KokkosKernels blas2 interfaces gemv ---- .. doxygenfunction:: KokkosBlas::gemv(const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) -.. doxygenfunction:: KokkosBlas::gemv(const typename AViewType::execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) +.. doxygenfunction:: KokkosBlas::gemv(const execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) + +ger +---- +.. doxygenfunction:: KokkosBlas::ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) +.. 
doxygenfunction:: KokkosBlas::ger(const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) diff --git a/docs/developer/apidocs/blas3.rst b/docs/developer/apidocs/blas3.rst index 3fa4e3e9c7..fea3dc252a 100644 --- a/docs/developer/apidocs/blas3.rst +++ b/docs/developer/apidocs/blas3.rst @@ -3,5 +3,19 @@ BLAS3 -- KokkosKernels blas3 interfaces gemm ---- +.. doxygenfunction:: KokkosBlas::gemm(const execution_space &space, const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) .. doxygenfunction:: KokkosBlas::gemm(const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) -.. doxygenfunction:: KokkosBlas::gemm(const typename CViewType::execution_space &space, const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) + +trmm +---- +.. doxygenfunction:: KokkosBlas::trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) +.. doxygenfunction:: KokkosBlas::trmm(const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) + +trtri +----- +.. doxygenfunction:: KokkosBlas::trtri + +trsm +---- +.. doxygenfunction:: KokkosBlas::trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) +.. 
doxygenfunction:: KokkosBlas::trsm(const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index ed877ac567..f73b507439 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -11,6 +11,11 @@ ccsmatrix .. doxygenclass:: KokkosSparse::CcsMatrix :members: +coomatrix +--------- +.. doxygenclass:: KokkosSparse::CooMatrix + :members: + crs2ccs ------- .. doxygenfunction:: KokkosSparse::crs2ccs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, RowMapViewType row_map, ColIdViewType col_ids) @@ -21,14 +26,21 @@ ccs2crs .. doxygenfunction:: KokkosSparse::ccs2crs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, ColMapViewType col_map, RowIdViewType row_ids) .. doxygenfunction:: KokkosSparse::ccs2crs(KokkosSparse::CcsMatrix &ccsMatrix) +coo2crs +------- +.. doxygenfunction:: KokkosSparse::coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, DataViewType data) +.. doxygenfunction:: KokkosSparse::coo2crs(KokkosSparse::CooMatrix &cooMatrix) + +crs2coo +------- +.. doxygenfunction:: KokkosSparse::crs2coo(OrdinalType, OrdinalType, SizeType, ValViewType, RowMapViewType, ColIdViewType) +.. doxygenfunction:: KokkosSparse::crs2coo(KokkosSparse::CrsMatrix &crsMatrix) + spmv ---- - -.. doxygenfunctions:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char[], const AlphaType&, const AMatrix&, const XVector&, const BetaType&, const YVector&) -.. doxygenfunctions:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) -.. 
doxygenfunctions:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) -.. doxygenfunctions:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) -.. doxygenfunctions:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) +.. doxygenfunction:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) trsv @@ -37,8 +49,39 @@ trsv spgemm ------ -.. doxygenfunction:: KokkosSparse::spgemm +.. doxygenfunction:: spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) +.. doxygenfunction:: spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) +.. doxygenfunction:: spgemm(const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode) + +block_spgemm +------------ +.. doxygenfunction:: block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A, const bool transposeA, const BMatrixType& B,const bool transposeB, CMatrixType& C) +.. 
doxygenfunction:: block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) + +gauss_seidel +------------ +.. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. 
doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) + +block_gauss_seidel +------------------ +.. doxygenfunction:: block_gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: block_gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols,typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: symmetric_block_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. 
doxygenfunction:: forward_sweep_block_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_block_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) + +par_ilut +-------- +.. doxygenfunction:: par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, LRowMapType& L_rowmap, URowMapType& U_rowmap) +.. doxygenfunction:: par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, UEntriesType& U_entries, UValuesType& U_values) +.. doxygenclass:: KokkosSparse::Experimental::PAR_ILUTHandle + :members: -gauss +gmres ----- -.. doxygenfunction:: KokkosSparse::gauss +.. 
doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) diff --git a/docs/developer/build_doc.rst b/docs/developer/build_doc.rst index dd3d357286..6ccd0dccf7 100644 --- a/docs/developer/build_doc.rst +++ b/docs/developer/build_doc.rst @@ -15,4 +15,6 @@ Building Developer Documentation cmake -DKokkosKernels_ENABLE_DOCS:BOOL=ON /path/to/kokkos-kernels make Doxygen make Sphinx - open build/docs/docs/sphinx/index.html \ No newline at end of file + open build/docs/docs/sphinx/index.html + +Alternatively, pass the --enable-docs option to cm_generate_makefile.bash. diff --git a/docs/developer/contrib.rst b/docs/developer/contrib.rst index 0b02ebf190..d9b7d31256 100644 --- a/docs/developer/contrib.rst +++ b/docs/developer/contrib.rst @@ -24,6 +24,69 @@ In general, we prefer that the prototype has the doxygen style comment rather th KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &); +.. code-block:: + :caption: Type Doxygen Style Example + + /// \class CooMatrix + /// + /// \brief Coordinate format implementation of a sparse matrix. + /// + /// \tparam ScalarType The type of scalar entries in the sparse matrix. + /// \tparam OrdinalType The type of index entries in the sparse matrix. + /// \tparam Device The Kokkos Device type. + /// "Coo" stands for "coordinate format". + template + class CooMatrix { + public: + //! Type of each value in the matrix + using scalar_type = ScalarType; + + private: + size_type m_num_rows, m_num_cols; + + public: + //! The data in the matrix + scalar_type data; + + /// \brief Default constructor; constructs an empty sparse matrix. + KOKKOS_INLINE_FUNCTION + CooMatrix() : m_num_rows(0), m_num_cols(0) {} + +**NOTE:** To have vscode generate the "\\\\\\" style stubs: + +1. install the C/C++ IntelliSense, debugging, and code browsing extension. + +2. 
go to Settings, Extensions, C/C++, Doxygen Documentation Generator Settings, and ensure the setting for Doxdocgen is "\\\\\\". + +3. place your cursor on the line above `template ...` and type "\\\\\\". + +Including your documentation with directives +-------------------------------------------- +Rather than have the documentation generation system default to generating documentation for the entire code base, +we opt-in to what we would like to include in the generated documentation. To opt-in, simply place the publicly facing +function signature or the class name in the appropriate ReStructuredText file. For example, to document a sparse +function and class open up kokkos-kernels/docs/developer/apidocs/sparse.rst: + +.. code-block:: + :caption: Function signature example + + coo2crs + ------- + .. doxygenfunction:: KokkosSparse::coo2crs(DimType, DimType, RowViewType, ColViewType, DataViewType) + .. doxygenfunction:: KokkosSparse::coo2crs(KokkosSparse::CooMatrix &cooMatrix) + +Note that only the signature is required. One may specify the parameter names and any default values, but this is not required. + +.. code-block:: + :caption: User defined type example + + coomatrix + --------- + .. doxygenclass:: KokkosSparse::CooMatrix + :members: + +For a full list of available directives, see https://breathe.readthedocs.io/en/latest/. + Library policies ---------------- diff --git a/docs/developer/index.rst b/docs/developer/index.rst index 7ee05f98ae..58f89084ac 100644 --- a/docs/developer/index.rst +++ b/docs/developer/index.rst @@ -7,4 +7,6 @@ Developer Manual Source Code Documentation Building the Documentation Code Style Guide - Contributing \ No newline at end of file + Contributing + Profiling + \ No newline at end of file diff --git a/docs/developer/profiling.rst b/docs/developer/profiling.rst new file mode 100644 index 0000000000..326281ab83 --- /dev/null +++ b/docs/developer/profiling.rst @@ -0,0 +1,24 @@ +Profiling +========= + +Compile Times +------------- +1. 
Select a clang compiler +2. Configure and include `-ftime-trace` in your CXX FLAGS (this works with clang+cuda). +3. Clone and build https://github.com/aras-p/ClangBuildAnalyzer. Put the binary directory in your `PATH`. +4. Compile Kokkos and KokkosKernels +5. Create a directory called `ftime-trace-artifacts` in your build directory +6. Copy the JSON files you care about into this directory, for example: + +.. code-block:: + + cp ./{sparse,blas}/unit_test/CMakeFiles/*.dir/backends/*.json ftime-trace-artifacts/ + +7. Run `ClangBuildAnalyzer`: + +.. code-block:: + + ClangBuildAnalyzer --all ftime-trace-artifacts/ profile.txt + ClangBuildAnalyzer --analyze profile.txt > analyze.txt + +8. Open `analyze.txt` \ No newline at end of file diff --git a/docs/developer/testing.rst.parked b/docs/developer/testing.rst.parked new file mode 100644 index 0000000000..c6bb810dc3 --- /dev/null +++ b/docs/developer/testing.rst.parked @@ -0,0 +1,16 @@ +Testing +======= + +Test matrix +----------------------------------- + +Unless noted otherwise in `Types`, we test with `float`. + +Format of name column: COMPILERS_BACKENDS_LAYOUTS_TPLS. + +.. 
csv-table:: :rst:dir:`Test Matrix` + :header: "Name", "Architectures", "Compilers", "Backends", "Layouts", "Types" + + "NIGHTLY_GCC930_CUDA11_OPENMP_CUDA_LEFT_BLAS_LAPACK", "Power9, Volta70, Pascal60", "GNU 9.3.0, NVCC 11.0.1", "OpenMP, Cuda", "Left" + "PR_GCC1030_INTEL20_SERIAL_OPENMP_RIGHT_BLAS_LAPACK", "Broadwell", "GCC, INTEL", "Serial, OpenMP", "Right" + "PR_ARMPL2110_SERIAL_OPENMP_LEFT_BLAS_LAPACK", "A64FX", "ARMPL 21.1.0", "Serial, OpenMP", "LayoutLeft" \ No newline at end of file diff --git a/docs/developer/write_developer_doc.rst b/docs/developer/write_developer_doc.rst.parked similarity index 100% rename from docs/developer/write_developer_doc.rst rename to docs/developer/write_developer_doc.rst.parked diff --git a/docs/developer/write_user_doc.rst b/docs/developer/write_user_doc.rst.parked similarity index 100% rename from docs/developer/write_user_doc.rst rename to docs/developer/write_user_doc.rst.parked diff --git a/docs/index.rst b/docs/index.rst index db873e9a3b..cd8a174ff9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ Kokkos Kernels documentation: Under Construction -========================================== +================================================ .. 
toctree:: :maxdepth: 2 diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp index 057902b6f2..b543ddaad6 100644 --- a/example/batched_solve/team_GMRES.cpp +++ b/example/batched_solve/team_GMRES.cpp @@ -236,8 +236,7 @@ int main(int /*argc*/, char ** /*argv*/) { using Layout = typename AMatrixValueView::array_layout; using EXSP = typename AMatrixValueView::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in b/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in deleted file mode 100644 index 23e1699557..0000000000 --- a/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ -#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ -namespace KokkosGraph { -namespace Impl { -@GRAPH_COLOR_D1_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index aa8180fae7..a359956a23 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -828,7 +828,6 @@ struct D2_MIS_FixedPriority { Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); lno_t workRemain = numVerts; - int numIter = 0; while (workRemain) { // do another iteration Kokkos::parallel_for( @@ -853,7 +852,6 @@ struct D2_MIS_FixedPriority { // Finally, flip the worklists std::swap(worklist1, worklist2); workRemain = newWorkRemain; - numIter++; } // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. diff --git a/graph/impl/KokkosGraph_color_d1_spec.hpp b/graph/impl/KokkosGraph_color_d1_spec.hpp index af5e2f0751..5d66240763 100644 --- a/graph/impl/KokkosGraph_color_d1_spec.hpp +++ b/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -120,6 +120,4 @@ struct COLOR_D1>, \ false, true>; -#include - #endif diff --git a/graph/src/KokkosGraph_Triangle.hpp b/graph/src/KokkosGraph_Triangle.hpp index 5c7360a88a..0a878891ce 100644 --- a/graph/src/KokkosGraph_Triangle.hpp +++ b/graph/src/KokkosGraph_Triangle.hpp @@ -232,11 +232,8 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, // if 2, we do an interleaved sort. 
} { - if (sh->get_sort_option() != -1) { - sort_decreasing_order = sh->get_sort_option(); - } - KokkosKernels::Impl::kk_sort_by_row_size( + KokkosSparse::Impl::kk_sort_by_row_size( m, row_mapA.data(), new_indices.data(), sort_decreasing_order, ExecutionSpace().concurrency()); } @@ -264,7 +261,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); - KokkosKernels::Impl::kk_get_lower_triangle< + KokkosSparse::Impl::kk_get_lower_triangle< alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( @@ -292,7 +289,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); - KokkosKernels::Impl::kk_get_lower_triangle< + KokkosSparse::Impl::kk_get_lower_triangle< alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( @@ -334,7 +331,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); - KokkosKernels::Impl::kk_get_lower_triangle< + KokkosSparse::Impl::kk_get_lower_triangle< alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( @@ -342,7 +339,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, lower_triangular_matrix_entries, null_values, new_indices, handle->is_dynamic_scheduling()); } - KokkosKernels::Impl:: + KokkosSparse::Impl:: kk_create_incidence_tranpose_matrix_from_lower_triangle< row_lno_persistent_work_view_t, 
nnz_lno_persistent_work_view_t, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, @@ -357,7 +354,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, case SPGEMM_KK_TRIANGLE_AI: { // these are the algorithms that requires the incidence matrix. - KokkosKernels::Impl::kk_create_incidence_matrix_from_original_matrix< + KokkosSparse::Impl::kk_create_incidence_matrix_from_original_matrix< alno_row_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, ExecutionSpace>(m, row_mapA, entriesA, incidence_rowmap, diff --git a/master_history.txt b/master_history.txt index 6c9f253c07..e7ed75b7f0 100644 --- a/master_history.txt +++ b/master_history.txt @@ -21,3 +21,4 @@ tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa tag: 3.7.01 date: 12/01/2022 master: 04821ac3 release: 6cb632b6 tag: 4.0.00 date: 02/23/2023 master: b4014bf2 release: a10dff20 tag: 4.0.01 date: 04/26/2023 master: b9c1bab7 release: 8809e41c +tag: 4.1.00 date: 06/20/2023 master: 1331baf1 release: 14ad220a diff --git a/ode/CMakeLists.txt b/ode/CMakeLists.txt new file mode 100644 index 0000000000..9d92dc07ba --- /dev/null +++ b/ode/CMakeLists.txt @@ -0,0 +1,15 @@ +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/ode/src) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/ode/impl) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/ode/unit_test) + +IF (NOT KokkosKernels_ENABLE_COMPONENT_BATCHED) + MESSAGE("blas enabled and batched not enabled, we need to include some headers manually!") + LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched) + LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched/dense/src) + LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched/dense/impl) +ENDIF() + + +# Adding unit-tests +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode) 
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode) diff --git a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp new file mode 100644 index 0000000000..85a8ec0b45 --- /dev/null +++ b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp @@ -0,0 +1,280 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS_RUNGEKUTTATABLES_IMPL_HPP +#define KOKKOSBLAS_RUNGEKUTTATABLES_IMPL_HPP + +#include + +namespace KokkosODE { +namespace Impl { +//===================================================================== +// Generalized RK Explicit ODE solver with embedded error estimation +//===================================================================== + +// Methods supported: +// Forward Euler (RKFE) +// Euler-Heun Method (RKEH) +// Fehlberg 1-2 (RKF12) +// Bogacki-Shampine (RKBS) +// Runge-Kutta 4th order (RK4) +// Fehlberg Method (RKF45) +// Cash-Karp Method (RKCK) +// Dormand-Prince Method (RKDP) + +// Format follows form of Butcher Tableau +// c1| a00 +// c2| a10 a11 +// c3| a20 a21 a22 +// c4| a30 a31 a32 a33 +// . | . . . +// . | . . . +// . | . . . +// cs| as0 as1 . . . . . . ass +//-------------------------------- +// | b0 b1 b2 b3 . . . bs +// | e0 e1 e2 e3 . . . es +// +// And is always in lower triangular form for explicit methods +// For explicit methods the coefficients on the diagonal will always be zero. +// +// Here, nstages = s = number of stages. 
+// 'order' refers to the accuracy of the method. +// The array of aij coefficients is ordered by rows as: a = +// {a00,a10,a11,a20,a21,a22....} +// e contains coefficients for error estimation + +template +struct ButcherTableau {}; + +template <> +struct ButcherTableau<0, 0> // Forward Euler +{ + static constexpr int order = 1; + static constexpr int nstages = 1; + + Kokkos::Array a{{1}}; + Kokkos::Array b{{1}}; + Kokkos::Array c{{0}}; + Kokkos::Array e{{0}}; +}; + +// Coefficients obtained from: (see page 39) +// Iserles, A. +// "A First Course in the Numerical Analysis of Differential Equations." +// Cambridge: Cambridge University Press. (2008). +// https://doi.org/10.1017/CBO9780511995569 +template <> +struct ButcherTableau<1, 1> // Euler-Heun Method +{ + static constexpr int order = 2; + static constexpr int nstages = 2; // total dimensions, nstagesxnstages system + Kokkos::Array a{ + {0.0, 1.0, + 0.0}}; //(nstages*nstages+nstages)/2 size of lower triangular matrix + Kokkos::Array b{{0.5, 0.5}}; + Kokkos::Array c{{0.0, 1.0}}; + Kokkos::Array e{{-0.5, 0.5}}; +}; + +// Coefficients obtained from: +// Fehlberg, E. +// "Klassische Runge-Kutta-Formeln vierter und niedrigerer Ordnung mit +// Schrittweiten-Kontrolle und ihre Anwendung auf Wärmeleitungsprobleme." +// Computing 6, 61–71 (1970). https://doi.org/10.1007/BF02241732 +template <> +struct ButcherTableau<1, 2> // Known as Fehlberg 1-2 method +{ + static constexpr int order = 2; + static constexpr int nstages = 3; + Kokkos::Array a{ + {0.0, 0.5, 0.0, 1.0 / 256.0, 255.0 / 256.0, 0.0}}; + Kokkos::Array b{{1.0 / 512.0, 255.0 / 256.0, 1. / 512}}; + Kokkos::Array c{{0.0, 1.0 / 2.0, 1.0}}; + Kokkos::Array e{ + {1.0 / 256.0 - 1.0 / 512.0, 0.0, -1.0 / 512.0}}; +}; + +// Coefficients obtained from: +// P. Bogacki, L.F. Shampine, +// "A 3(2) pair of Runge - Kutta formulas," +// Applied Mathematics Letters, Volume 2, Issue 4, 1989, +// https://doi.org/10.1016/0893-9659(89)90079-7. 
+template <>
+struct ButcherTableau<2, 3>  // Bogacki-Shampine method
+{
+  static constexpr int order   = 3;
+  static constexpr int nstages = 4;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{
+      {0.0, 0.5, 0.0, 0.0, 3.0 / 4.0, 0.0, 2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0,
+       0.0}};
+  Kokkos::Array<double, nstages> b{{2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0, 0.0}};
+  Kokkos::Array<double, nstages> c{{0.0, 0.5, 0.75, 1.0}};
+  Kokkos::Array<double, nstages> e{{2.0 / 9.0 - 7.0 / 24.0, 1.0 / 3.0 - 0.25,
+                                    4.0 / 9.0 - 1.0 / 3.0, -1.0 / 8.0}};
+};
+
+// Coefficients obtained from:
+// Hull, David G.
+// "Fourth-order Runge-Kutta integration with stepsize control."
+// AIAA Journal 15.10 (1977): 1505-1507.
+template <>
+struct ButcherTableau<3, 3>  // RK4
+{
+  static constexpr int order   = 4;
+  static constexpr int nstages = 4;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{
+      {0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 1.0, 0.0}};
+  Kokkos::Array<double, nstages> b{
+      {1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0}};
+  Kokkos::Array<double, nstages> c{{0.0, 0.5, 0.5, 1.0}};
+  Kokkos::Array<double, nstages> e{{1.0 / 6.0, 0.0, -1.0 / 3.0, 1.0 / 6.0}};
+};
+
+// Coefficients obtained from:
+// Fehlberg, E.
+// "Klassische Runge-Kutta-Formeln vierter und niedrigerer Ordnung mit
+// Schrittweiten-Kontrolle und ihre Anwendung auf Wärmeleitungsprobleme."
+// Computing 6, 61–71 (1970). https://doi.org/10.1007/BF02241732
+template <>
+struct ButcherTableau<4, 5>  // Fehlberg Method
+{
+  static constexpr int order   = 5;
+  static constexpr int nstages = 6;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{{0.0,
+                                                              0.25,
+                                                              0.0,
+                                                              3.0 / 32.0,
+                                                              9.0 / 32.0,
+                                                              0.0,
+                                                              1932.0 / 2197.0,
+                                                              -7200.0 / 2197.0,
+                                                              7296.0 / 2197.0,
+                                                              0.0,
+                                                              439.0 / 216.0,
+                                                              -8.0,
+                                                              3680.0 / 513.0,
+                                                              -845.0 / 4104.0,
+                                                              0.0,
+                                                              -8.0 / 27.0,
+                                                              2.0,
+                                                              -3544.0 / 2565.0,
+                                                              1859.0 / 4104.0,
+                                                              -11.0 / 40.0,
+                                                              0.0}};
+  Kokkos::Array<double, nstages> b{{16.0 / 135.0, 0.0, 6656.0 / 12825.0,
+                                    28561.0 / 56430.0, -9.0 / 50.0,
+                                    2.0 / 55.0}};
+  Kokkos::Array<double, nstages> c{
+      {0.0, 0.25, 3.0 / 8.0, 12.0 / 13.0, 1.0, 0.5}};
+  Kokkos::Array<double, nstages> e{
+      {16.0 / 135.0 - 25.0 / 216.0, 0.0, 6656.0 / 12825.0 - 1408.0 / 2565.0,
+       28561.0 / 56430.0 - 2197.0 / 4104.0, -9.0 / 50.0 + 0.2, 2.0 / 55.0}};
+};
+
+// Coefficients obtained from:
+// J. R. Cash and Alan H. Karp.
+// "A variable order Runge-Kutta method for initial value problems with rapidly
+// varying right-hand sides." ACM Trans. Math. Softw. 16, 3 (Sept. 1990),
+// 201–222. https://doi.org/10.1145/79505.79507
+template <>
+struct ButcherTableau<4, 5, 1>  // Cash-Karp
+{
+  static constexpr int order   = 5;
+  static constexpr int nstages = 6;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{
+      {0.0,
+       0.2,
+       0.0,
+       3.0 / 40.0,
+       9.0 / 40.0,
+       0.0,
+       0.3,
+       -0.9,
+       1.2,
+       0.0,
+       -11.0 / 54.0,
+       2.5,
+       -70.0 / 27.0,
+       35.0 / 27.0,
+       0.0,
+       1631.0 / 55296.0,
+       175.0 / 512.0,
+       575.0 / 13824.0,
+       44275.0 / 110592.0,
+       253.0 / 4096.0,
+       0.0}};
+  Kokkos::Array<double, nstages> b{
+      {37.0 / 378.0, 0.0, 250.0 / 621.0, 125.0 / 594.0, 0.0, 512.0 / 1771.0}};
+  Kokkos::Array<double, nstages> c{{0.0, 0.2, 0.3, 0.6, 1.0, 7.0 / 8.0}};
+  Kokkos::Array<double, nstages> e{{37.0 / 378.0 - 2825.0 / 27648.0, 0.0,
+                                    250.0 / 621.0 - 18575.0 / 48384.0,
+                                    125.0 / 594.0 - 13525.0 / 55296.0,
+                                    -277.0 / 14336.0, 512.0 / 1771.0 - 0.25}};
+};
+
+// Coefficients obtained from:
+// J.R. Dormand, P.J. Prince,
+// "A family of embedded Runge-Kutta formulae",
+// Journal of Computational and Applied Mathematics, Volume 6, Issue 1, 1980,
+// https://doi.org/10.1016/0771-050X(80)90013-3.
+template <>
+struct ButcherTableau<4, 6>  // Referred to as DOPRI5 or RKDP
+{
+  static constexpr int order   = 5;
+  static constexpr int nstages = 7;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{{0.0,
+                                                              0.2,
+                                                              0.0,
+                                                              3.0 / 40.0,
+                                                              9.0 / 40.0,
+                                                              0.0,
+                                                              44.0 / 45.0,
+                                                              -56.0 / 15.0,
+                                                              32.0 / 9.0,
+                                                              0.0,
+                                                              19372.0 / 6561.0,
+                                                              -25360.0 / 2187.0,
+                                                              64448.0 / 6561.0,
+                                                              -212.0 / 729.0,
+                                                              0.0,
+                                                              9017.0 / 3168.0,
+                                                              -355.0 / 33.0,
+                                                              46732.0 / 5247.0,
+                                                              49.0 / 176.0,
+                                                              -5103.0 / 18656.0,
+                                                              0.0,
+                                                              35.0 / 384.0,
+                                                              0.0,
+                                                              500.0 / 1113.0,
+                                                              125.0 / 192.0,
+                                                              -2187.0 / 6784.0,
+                                                              11.0 / 84.0,
+                                                              0.0}};
+  Kokkos::Array<double, nstages> b{{35.0 / 384.0, 0.0, 500.0 / 1113.0,
+                                    125.0 / 192.0, -2187.0 / 6784.0,
+                                    11.0 / 84.0, 0.0}};
+  Kokkos::Array<double, nstages> c{{0.0, 0.2, 0.3, 0.8, 8.0 / 9.0, 1.0, 1.0}};
+  Kokkos::Array<double, nstages> e{
+      {35.0 / 384.0 - 5179.0 / 57600.0, 0.0, 500.0 / 1113.0 - 7571.0 / 16695.0,
+       125.0 / 192.0 - 393.0 / 640.0, -2187.0 / 6784.0 + 92097.0 / 339200.0,
+       11.0 / 84.0 - 187.0 / 2100.0, -1.0 / 40.0}};
+};
+
+}  // namespace Impl
+}  // namespace KokkosODE
+
+#endif  // KOKKOSBLAS_RUNGEKUTTATABLES_IMPL_HPP
diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp
new file mode 100644
index 0000000000..791093c8db
--- /dev/null
+++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp
@@ -0,0 +1,179 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSBLAS_RUNGEKUTTA_IMPL_HPP
+#define KOKKOSBLAS_RUNGEKUTTA_IMPL_HPP
+
+#include "Kokkos_Core.hpp"
+#include "KokkosBlas1_scal.hpp"
+#include "KokkosBlas1_axpby.hpp"
+#include "KokkosODE_RungeKuttaTables_impl.hpp"
+#include "KokkosODE_Types.hpp"
+
+namespace KokkosODE {
+namespace Impl {
+
+// Performs one explicit Runge-Kutta step of size dt starting from time t:
+//   y_new = y_old + dt*sum(b_i*k_i)              i in [0, nstages)
+//   k_i = f(t+c_i*dt, y_old+dt*sum(a_{ij}*k_j))  j in [0, i)
+// The k_i are stored in k_vecs as we go to be reused by the later stages.
+// With adaptivity, temp holds the error estimate dt*sum(e_i*k_i) on exit.
+template <class ode_type, class table_type, class vec_type, class mv_type,
+          class scalar_type>
+KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table,
+                            const bool adaptivity, scalar_type t,
+                            scalar_type dt, const vec_type& y_old,
+                            const vec_type& y_new, const vec_type& temp,
+                            const mv_type& k_vecs) {
+  const int neqs    = ode.neqs;
+  const int nstages = table.nstages;
+
+  // first set y_new = y_old
+  for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+    y_new(eqIdx) = y_old(eqIdx);
+  }
+
+  // now accumulate y_new += dt*b_i*k_i
+  {
+    // we always start with y_new += dt*b_0*k0
+    auto k0 = Kokkos::subview(k_vecs, Kokkos::ALL, 0);
+    ode.evaluate_function(t + table.c[0] * dt, dt, y_old, k0);
+    for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+      y_new(eqIdx) += dt * table.b[0] * k0(eqIdx);
+    }
+  }
+
+  // Now that we have k0, we can compute all other k_i
+  // and accumulate them in y_new.
+  for (int stageIdx = 1; stageIdx < nstages; ++stageIdx) {
+    for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+      temp(eqIdx) = 0;
+    }
+
+    for (int idx = 0; idx < stageIdx; ++idx) {
+      for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+        temp(eqIdx) +=
+            table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(eqIdx, idx);
+      }
+    }
+    KokkosBlas::SerialScale::invoke(dt, temp);
+    KokkosBlas::serial_axpy(1, y_old, temp);
+    auto k = Kokkos::subview(k_vecs, Kokkos::ALL, stageIdx);
+    ode.evaluate_function(t + table.c[stageIdx] * dt, dt, temp, k);
+    for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+      y_new(eqIdx) += dt * table.b[stageIdx] * k(eqIdx);
+    }
+  }
+
+  // Compute estimation of the error using k_vecs and table.e
+  if (adaptivity) {
+    for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+      temp(eqIdx) = 0;
+      for (int stageIdx = 0; stageIdx < nstages; ++stageIdx) {
+        temp(eqIdx) += dt * table.e[stageIdx] * k_vecs(eqIdx, stageIdx);
+      }
+    }
+  }
+}  // RKStep
+
+// Integrates ode from t_start to t_end with the method described by table.
+// y0 is overwritten with y after each accepted step. Returns SUCCESS when
+// t_end is reached, MAX_STEP when params.max_steps steps are not enough and
+// MIN_SIZE when a rejected step requires dt < params.min_step_size.
+template <class ode_type, class table_type, class vec_type, class mv_type,
+          class scalar_type>
+KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(
+    const ode_type& ode, const table_type& table,
+    const KokkosODE::Experimental::ODE_params& params,
+    const scalar_type t_start, const scalar_type t_end, const vec_type& y0,
+    const vec_type& y, const vec_type& temp, const mv_type& k_vecs) {
+  constexpr scalar_type error_threshold = 1;
+  // Floating point exponent: the integer division 1 / table.order yields 0
+  const scalar_type inv_order = scalar_type(1) / table.order;
+  bool adapt                  = params.adaptivity;
+  if (std::is_same_v<table_type, ButcherTableau<0, 0>>) {
+    adapt = false;
+  }
+
+  scalar_type dt = (t_end - t_start) / params.max_steps;
+  scalar_type t  = t_start;
+  for (int stepIdx = 0; (stepIdx < params.max_steps) && (t < t_end);
+       ++stepIdx) {
+    // Shorten the step if it would take us past t_end
+    if (t_end < t + dt) dt = t_end - t;
+
+    // Set error to be arbitrarily larger than our threshold of 1
+    scalar_type error = 2 * error_threshold;
+    scalar_type tol   = 0;
+    while (error_threshold < error) {
+      // Take a step of Runge-Kutta integrator
+      RKStep(ode, table, adapt, t, dt, y0, y, temp, k_vecs);
+
+      // Compute the largest error and decide on
+      // the size of the next time step to take.
+      error = 0;
+      if (adapt) {
+        // Compute the error
+        for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) {
+          error = Kokkos::max(error, Kokkos::abs(temp(eqIdx)));
+          tol   = Kokkos::max(
+              tol, params.abs_tol +
+                       params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)),
+                                                    Kokkos::abs(y0(eqIdx))));
+        }
+        error = error / tol;
+
+        // Reduce the time step if error is too large
+        // and the current step is rejected.
+        if (error > 1) {
+          dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, inv_order));
+          if (dt < params.min_step_size)
+            return Experimental::ode_solver_status::MIN_SIZE;
+        }
+      }
+    }
+
+    // The step is accepted: advance the time and
+    // update y0 to stage the next time step.
+    t += dt;
+    for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) {
+      y0(eqIdx) = y(eqIdx);
+    }
+
+    if (t < t_end) {
+      // NOTE: printing t and y here helps debugging but it needs to be
+      // made GPU friendly and guarded for debug runs since it prints a lot
+      if (adapt) {
+        // Compute new time increment
+        dt = dt *
+             Kokkos::min(
+                 10.0,
+                 Kokkos::max(2.0, 0.9 * Kokkos::pow(error, inv_order)));
+      }
+    } else {
+      return Experimental::ode_solver_status::SUCCESS;
+    }
+  }
+
+  if (t < t_end) return Experimental::ode_solver_status::MAX_STEP;
+
+  return Experimental::ode_solver_status::SUCCESS;
+}  // RKSolve
+
+}  // namespace Impl
+}  // namespace KokkosODE
+
+#endif  // KOKKOSBLAS_RUNGEKUTTA_IMPL_HPP
diff --git a/ode/src/KokkosODE_RungeKutta.hpp b/ode/src/KokkosODE_RungeKutta.hpp
new file mode 100644
index 0000000000..c41d79c1ef
--- /dev/null
+++ b/ode/src/KokkosODE_RungeKutta.hpp
@@ -0,0 +1,141 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSODE_RUNGEKUTTA_HPP
+#define KOKKOSODE_RUNGEKUTTA_HPP
+
+/// \author Luc Berger-Vergiat (lberg@sandia.gov)
+/// \file KokkosODE_RungeKutta.hpp
+
+#include "Kokkos_Core.hpp"
+#include "KokkosODE_Types.hpp"
+
+#include "KokkosODE_RungeKutta_impl.hpp"
+
+namespace KokkosODE {
+namespace Experimental {
+
+/// \brief RK_type is an enum type that conveniently
+/// describes the Runge-Kutta methods implemented.
+enum RK_type : int {
+  RKFE  = 0,  ///< Forward Euler method (no adaptivity available for this method)
+  RKEH  = 1,  ///< Euler-Heun method
+  RKF12 = 2,  ///< Fehlberg order 2 method
+  RKBS  = 3,  ///< Bogacki-Shampine method
+  RK4   = 4,  ///< Runge-Kutta classic order 4 method
+  RKF45 = 5,  ///< Fehlberg order 5 method
+  RKCK  = 6,  ///< Cash-Karp method
+  RKDP  = 7   ///< Dormand-Prince method
+};
+
+template <RK_type T>
+struct RK_Tableau_helper {
+  using table_type = void;
+};
+
+template <>
+struct RK_Tableau_helper<RK_type::RKFE> {
+  using table_type = KokkosODE::Impl::ButcherTableau<0, 0>;
+};
+
+template <>
+struct RK_Tableau_helper<RK_type::RKEH> {
+  using table_type = KokkosODE::Impl::ButcherTableau<1, 1>;
+};
+
+template <>
+struct RK_Tableau_helper<RK_type::RKF12> {
+  using table_type = KokkosODE::Impl::ButcherTableau<1, 2>;
+};
+
+template <>
+struct RK_Tableau_helper<RK_type::RKBS> {
+  using table_type = KokkosODE::Impl::ButcherTableau<2, 3>;
+};
+
+template <>
+struct RK_Tableau_helper<RK_type::RK4> {
+  using table_type = KokkosODE::Impl::ButcherTableau<3, 3>;
+};
+
+template <>
+struct RK_Tableau_helper<RK_type::RKF45> {
+  using table_type = KokkosODE::Impl::ButcherTableau<4, 5>;
+};
+
+template <>
+struct RK_Tableau_helper<RK_type::RKCK> {
+  using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>;
+};
+
+template <>
+struct RK_Tableau_helper<RK_type::RKDP> {
+  using table_type = KokkosODE::Impl::ButcherTableau<4, 6>;
+};
+
+/// \brief Unspecialized version of the RungeKutta solvers
+///
+/// \tparam RK_type an RK_type enum value used to specify
+/// which Runge Kutta method is to be used.
+template <RK_type T>
+struct RungeKutta {
+  using table_type = typename RK_Tableau_helper<T>::table_type;
+
+  /// \brief order returns the convergence order of the method
+  KOKKOS_FUNCTION
+  static int order() { return table_type::order; }
+
+  /// \brief num_stages returns the number of stages used by the method
+  KOKKOS_FUNCTION
+  static int num_stages() { return table_type::nstages; }
+
+  /// \brief Solve integrates an ordinary differential equation
+  ///
+  /// The integration is carried out with the method specified as template
+  /// parameter to the RungeKutta struct. This method is static and
+  /// marked as KOKKOS_FUNCTION so it can be used on host and device.
+  ///
+  /// \tparam ode_type the type of the ode object to integrate
+  /// \tparam vec_type a rank-1 view
+  /// \tparam mv_type a rank-2 view
+  /// \tparam scalar_type a floating point type
+  ///
+  /// \param ode [in]: the ode to integrate
+  /// \param params [in]: standard input parameters of ODE integrators
+  /// \param t_start [in]: time at which the integration starts
+  /// \param t_end [in]: time at which the integration stops
+  /// \param y0 [in/out]: vector of initial conditions, set to the solution
+  /// at the end of the integration
+  /// \param y [out]: vector of solution at t_end
+  /// \param temp [in]: vector for temporary storage
+  /// \param k_vecs [in]: vectors for temporary storage
+  ///
+  /// \return ode_solver_status an enum that describes success or failure
+  /// of the integration method once it has terminated.
+  template <class ode_type, class vec_type, class mv_type, class scalar_type>
+  KOKKOS_FUNCTION static ode_solver_status Solve(
+      const ode_type& ode, const KokkosODE::Experimental::ODE_params& params,
+      const scalar_type t_start, const scalar_type t_end, const vec_type& y0,
+      const vec_type& y, const vec_type& temp, const mv_type& k_vecs) {
+    table_type table;
+    return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y,
+                                    temp, k_vecs);
+  }
+};
+
+}  // namespace Experimental
+}  // namespace KokkosODE
+#endif  // KOKKOSODE_RUNGEKUTTA_HPP
diff --git a/ode/src/KokkosODE_Types.hpp b/ode/src/KokkosODE_Types.hpp
new file mode 100644
index 0000000000..136ff75536
--- /dev/null
+++ b/ode/src/KokkosODE_Types.hpp
@@ -0,0 +1,56 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSODE_TYPES_HPP
+#define KOKKOSODE_TYPES_HPP
+
+namespace KokkosODE {
+namespace Experimental {
+
+enum ode_solver_status { SUCCESS = 0, MAX_STEP = 1, MIN_SIZE = 2 };
+
+struct ODE_params {
+  bool adaptivity;
+  int num_steps, max_steps;
+  double abs_tol, rel_tol, min_step_size;
+
+  // Constructor that only specifies the desired number of steps.
+  // In this case no adaptivity is provided, the time step will
+  // be constant such that dt = (tend - tstart) / num_steps;
+  KOKKOS_FUNCTION
+  ODE_params(const int num_steps_)
+      : adaptivity(false),
+        num_steps(num_steps_),
+        max_steps(num_steps_),
+        abs_tol(0),
+        rel_tol(0),
+        min_step_size(0) {}
+
+  /// ODE_params constructor for adaptive time stepping.
+  KOKKOS_FUNCTION
+  ODE_params(const int num_steps_, const int max_steps_, const double abs_tol_,
+             const double rel_tol_, const double min_step_size_)
+      : adaptivity(true),
+        num_steps(num_steps_),
+        max_steps(max_steps_),
+        abs_tol(abs_tol_),
+        rel_tol(rel_tol_),
+        min_step_size(min_step_size_) {}
+};
+
+}  // namespace Experimental
+}  // namespace KokkosODE
+#endif  // KOKKOSODE_TYPES_HPP
diff --git a/ode/unit_test/CMakeLists.txt b/ode/unit_test/CMakeLists.txt
new file mode 100644
index 0000000000..90d6c45c49
--- /dev/null
+++ b/ode/unit_test/CMakeLists.txt
@@ -0,0 +1,108 @@
+#####################
+#                   #
+# Add include files #
+#                   #
+#####################
+
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/test_common)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${PACKAGE_SOURCE_DIR}/test_common)
+
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/src)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${PACKAGE_SOURCE_DIR}/src)
+
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode)
+
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode/src)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode/src)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode/impl)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode/impl)
+
+#####################
+#                   #
+# Define unit-tests #
+#                   #
+#####################
+
+#####################
+#                   #
+# Add GPU backends  #
+#                   #
+#####################
+IF (KOKKOS_ENABLE_CUDA)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    ode_cuda
+    SOURCES
+      ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp
+      backends/Test_Cuda_ODE.cpp
+    COMPONENTS ode
+  )
+ENDIF ()
+
+IF (KOKKOS_ENABLE_HIP)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    ode_hip
+    SOURCES
${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_HIP_ODE.cpp + COMPONENTS ode + ) +ENDIF () + +IF (KOKKOS_ENABLE_SYCL) + KOKKOSKERNELS_ADD_UNIT_TEST( + ode_sycl + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_SYCL_ODE.cpp + COMPONENTS ode + ) +ENDIF () + +IF (KOKKOS_ENABLE_OPENMPTARGET) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # ode_openmptarget + # SOURCES + # ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + # backends/Test_OpenMPTarget_ODE.cpp + # COMPONENTS ode + # ) +ENDIF () + + + +##################### +# # +# Add CPU backends # +# # +##################### +IF (KOKKOS_ENABLE_SERIAL) + KOKKOSKERNELS_ADD_UNIT_TEST( + ode_serial + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Serial_ODE.cpp + COMPONENTS ode + ) +ENDIF () + +IF (KOKKOS_ENABLE_OPENMP) + KOKKOSKERNELS_ADD_UNIT_TEST( + ode_openmp + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_OpenMP_ODE.cpp + COMPONENTS ode + ) +ENDIF () + +IF (KOKKOS_ENABLE_THREADS) + KOKKOSKERNELS_ADD_UNIT_TEST( + ode_threads + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Threads_ODE.cpp + COMPONENTS ode + ) +ENDIF () + diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in b/ode/unit_test/Test_ODE.hpp similarity index 75% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in rename to ode/unit_test/Test_ODE.hpp index 42982920fd..dd929c48fc 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in +++ b/ode/unit_test/Test_ODE.hpp @@ -13,12 +13,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER +#ifndef TEST_ODE_HPP +#define TEST_ODE_HPP -#ifndef KOKKOSBLAS1_DOT_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_DOT_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_DOT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif +#include "Test_ODE_RK.hpp" +#include "Test_ODE_RK_chem.hpp" + 
+#endif // TEST_ODE_HPP diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp new file mode 100644 index 0000000000..1e851108f3 --- /dev/null +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -0,0 +1,481 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_RungeKutta.hpp" + +namespace Test { + +// damped harmonic undriven oscillator +// m y'' + c y' + k y = 0 +// solution: y=A * exp(-xi * omega_0 * t) * sin(sqrt(1-xi^2) * omega_0 * t + +// phi) omega_0 = sqrt(k/m); xi = c / sqrt(4*m*k) A and phi depend on y(0) and +// y'(0); Change of variables: x(t) = y(t)*exp(-c/(2m)*t) = y(t)*exp(-xi * +// omega_0 * t) Change of variables: X = [x ] +// [x'] +// Leads to X' = A*X with A = [ 0 1] +// [-d 0] +// with d = k/m - (c/(2m)^2) = (1 - xi^2)*omega_0^2 +struct duho { + constexpr static int neqs = 2; + const double m, c, k, d; + const double a11 = 0, a12 = 1, a21, a22; + + duho(const double m_, const double c_, const double k_) + : m(m_), + c(c_), + k(k_), + d(k_ / m_ - (c_ * c_) / (4 * m_ * m_)), + a21(-k / m), + a22(-c / m){}; + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = a11 * y(0) + a12 * y(1); + f(1) = a21 * y(0) + a22 * y(1); + } + + template + KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, + const vec_type& y) const { + using KAT = Kokkos::ArithTraits; + + const double gamma 
= c / (2 * m); + const double omega = KAT::sqrt(k / m - gamma * gamma); + const double phi = KAT::atan((y0(1) + gamma * y0(0)) / (y0(0) * omega)); + const double A = y0(0) / KAT::cos(phi); + + y(0) = A * KAT::cos(omega * t - phi) * KAT::exp(-t * gamma); + y(1) = -y(0) * gamma - + omega * A * KAT::sin(omega * t - phi) * KAT::exp(-t * gamma); + } + +}; // duho + +template +struct solution_wrapper { + ode_type ode; + scalar_type t; + vec_type y_old, y_ref; + + solution_wrapper(const ode_type& ode_, const scalar_type t_, + const vec_type& y_old_, const vec_type& y_ref_) + : ode(ode_), t(t_), y_old(y_old_), y_ref(y_ref_){}; + + KOKKOS_FUNCTION + void operator()(const int /*idx*/) const { ode.solution(t, y_old, y_ref); } +}; + +template +struct RKSolve_wrapper { + using ode_params = KokkosODE::Experimental::ODE_params; + + ode_type my_ode; + ode_params params; + scalar_type tstart, tend; + int max_steps; + vec_type y_old, y_new, tmp; + mv_type kstack; + + RKSolve_wrapper(const ode_type& my_ode_, const ode_params& params_, + const scalar_type tstart_, const scalar_type tend_, + const vec_type& y_old_, const vec_type& y_new_, + const vec_type& tmp_, const mv_type& kstack_) + : my_ode(my_ode_), + params(params_), + tstart(tstart_), + tend(tend_), + y_old(y_old_), + y_new(y_new_), + tmp(tmp_), + kstack(kstack_) {} + + KOKKOS_FUNCTION + void operator()(const int /*idx*/) const { + KokkosODE::Experimental::RungeKutta::Solve( + my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + } +}; + +template +void test_method(const std::string label, ode_type& my_ode, + const scalar_type& tstart, const scalar_type& tend, + const int num_steps, vec_type& y_old, vec_type& y_new, + const int order, const int num_stages, + const Kokkos::View& ks, + const Kokkos::View& sol, + typename vec_type::HostMirror y_ref_h) { + using execution_space = typename vec_type::execution_space; + using solver_type = KokkosODE::Experimental::RungeKutta; + + KokkosODE::Experimental::ODE_params 
params(num_steps); + vec_type tmp("tmp vector", my_ode.neqs); + mv_type kstack("k stack", my_ode.neqs, solver_type::num_stages()); + + Kokkos::RangePolicy my_policy(0, 1); + RKSolve_wrapper + solve_wrapper(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + auto kstack_h = Kokkos::create_mirror_view(kstack); + Kokkos::deep_copy(kstack_h, kstack); + + EXPECT_EQ(solver_type::order(), order); + EXPECT_EQ(solver_type::num_stages(), num_stages); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\n" << label << std::endl; + std::cout << " order: " << solver_type::order() << std::endl; + std::cout << " number of stages: " << solver_type::num_stages() << std::endl; +#else + (void)label; +#endif + for (int stageIdx = 0; stageIdx < solver_type::num_stages(); ++stageIdx) { + EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(0, stageIdx), 1e-8); + EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(1, stageIdx), 1e-8); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << " k" << stageIdx << "={" << kstack_h(0, stageIdx) << ", " + << kstack_h(1, stageIdx) << "}" << std::endl; +#endif + } + EXPECT_NEAR_KK(sol(0), y_new_h(0), 1e-8); + EXPECT_NEAR_KK(sol(1), y_new_h(1), 1e-8); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << " y={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; + std::cout << " error={" + << Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)) + << ", " + << Kokkos::abs(y_new_h(1) - y_ref_h(1)) / Kokkos::abs(y_ref_h(1)) + << "}" << std::endl; +#else + (void)y_ref_h; +#endif + +} // test_method + +template +void test_RK() { + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + + duho my_oscillator(1, 1, 4); + const int neqs = my_oscillator.neqs; + + vec_type y("solution", neqs), f("function", neqs); + auto y_h = Kokkos::create_mirror(y); + 
y_h(0) = 1; + y_h(1) = 0; + Kokkos::deep_copy(y, y_h); + + constexpr double tstart = 0, tend = 0.01; + constexpr int num_steps = 1000; + double dt = (tend - tstart) / num_steps; + vec_type y_new("y new", neqs), y_old("y old", neqs); + + // Since y_old_h will be reused to set initial conditions + // for each method tested we do not want to use + // create_mirror_view which would not do a copy + // when y_old is in HostSpace. + typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 1; + y_old_h(1) = 0; + + // First compute analytical solution as reference + // and to evaluate the error from each RK method. + vec_type y_ref("reference value", neqs); + auto y_ref_h = Kokkos::create_mirror(y_ref); + { + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::RangePolicy my_policy(0, 1); + solution_wrapper wrapper(my_oscillator, tstart + dt, y_old, y_ref); + Kokkos::parallel_for(my_policy, wrapper); + + Kokkos::deep_copy(y_ref_h, y_ref); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nAnalytical solution" << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" + << std::endl; +#endif + } + + // We perform a single step using a RK method + // and check the values for ki and y_new against + // expected values. 
+ { + Kokkos::deep_copy(y_old, y_old_h); + double ks_raw[2] = {0, -4}; + Kokkos::View ks(ks_raw, 2, 1); + double sol_raw[2] = {1, -0.04}; + Kokkos::View sol(sol_raw, 2); + test_method( + "Euler-Forward", my_oscillator, tstart, tend, 1, y_old, y_new, 1, 1, ks, + sol, y_ref_h); + } + + { + Kokkos::deep_copy(y_old, y_old_h); + double ks_raw[4] = {0, -0.04, -4, -3.96}; + Kokkos::View ks(ks_raw, 2, 2); + double sol_raw[2] = {0.9998, -0.0398}; + Kokkos::View sol(sol_raw, 2); + test_method( + "Euler-Heun", my_oscillator, tstart, tend, 1, y_old, y_new, 2, 2, ks, + sol, y_ref_h); + } + + { + Kokkos::deep_copy(y_old, y_old_h); + double ks_raw[6] = {0, -0.02, -0.03980078, -4, -3.98, -3.95940234}; + Kokkos::View ks(ks_raw, 2, 3); + double sol_raw[2] = {0.9998, -0.03979999}; + Kokkos::View sol(sol_raw, 2); + test_method( + "RKF-12", my_oscillator, tstart, tend, 1, y_old, y_new, 2, 3, ks, sol, + y_ref_h); + } + + { + Kokkos::deep_copy(y_old, y_old_h); + double ks_raw[8] = {0, -0.02, -0.02985, -0.039798, + -4, -3.98, -3.96955, -3.95940467}; + Kokkos::View ks(ks_raw, 2, 4); + double sol_raw[2] = {0.99980067, -0.039798}; + Kokkos::View sol(sol_raw, 2); + test_method( + "RKBS", my_oscillator, tstart, tend, 1, y_old, y_new, 3, 4, ks, sol, + y_ref_h); + } + + { + Kokkos::deep_copy(y_old, y_old_h); + double ks_raw[12] = {0, -0.01, -0.01497188, -0.03674986, + -0.03979499, -0.0199505, -4, -3.99, + -3.98491562, -3.96257222, -3.95941166, -3.97984883}; + Kokkos::View ks(ks_raw, 2, 6); + double sol_raw[2] = {0.99980067, -0.03979801}; + Kokkos::View sol(sol_raw, 2); + test_method( + "RKF-45", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 6, ks, sol, + y_ref_h); + } + + { + Kokkos::deep_copy(y_old, y_old_h); + double ks_raw[12] = {0, -0.008, -0.011982, -0.02392735, + -0.03979862, -0.03484563, -4, -3.992, + -3.987946, -3.97578551, -3.95940328, -3.96454357}; + Kokkos::View ks(ks_raw, 2, 6); + double sol_raw[2] = {0.99980067, -0.03979801}; + Kokkos::View sol(sol_raw, 2); + test_method( + 
"Cash-Karp", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 6, ks, + sol, y_ref_h); + } + + { + Kokkos::deep_copy(y_old, y_old_h); + double ks_raw[14] = {0, -0.008, -0.011982, -0.03187008, + -0.03539333, -0.0397954, -0.03979801, -4, + -3.992, -3.987946, -3.96762048, -3.96398013, + -3.95941068, -3.95940467}; + Kokkos::View ks(ks_raw, 2, 7); + double sol_raw[2] = {0.99980067, -0.03979801}; + Kokkos::View sol(sol_raw, 2); + test_method( + "Dormand-Prince", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 7, + ks, sol, y_ref_h); + } + +} // test_RK + +template +void test_rate(ode_type& my_ode, const scalar_type& tstart, + const scalar_type& tend, + Kokkos::View num_steps, + typename vec_type::HostMirror& y_old_h, + typename vec_type::HostMirror& y_ref_h, + typename vec_type::HostMirror& error) { + using execution_space = typename vec_type::execution_space; + using solver_type = KokkosODE::Experimental::RungeKutta; + + vec_type tmp("tmp vector", my_ode.neqs); + mv_type kstack("k stack", my_ode.neqs, solver_type::num_stages()); + + vec_type y_new("solution", my_ode.neqs); + vec_type y_old("intial conditions", my_ode.neqs); + auto y_new_h = Kokkos::create_mirror(y_new); + + Kokkos::RangePolicy my_policy(0, 1); + for (int idx = 0; idx < num_steps.extent_int(0); ++idx) { + KokkosODE::Experimental::ODE_params params(num_steps(idx)); + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + RKSolve_wrapper + solve_wrapper(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + Kokkos::parallel_for(my_policy, solve_wrapper); + + Kokkos::deep_copy(y_new_h, y_new); + error(idx) = Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)); + +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + scalar_type dt = (tend - tstart) / num_steps(idx); + std::cout << "dt=" << dt << ", error=" << error(idx) << ", solution: {" + << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; +#endif + } + +} // test_method + +template +void test_convergence_rate() { + using RK_type 
= KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + + duho my_oscillator(1, 1, 4); + const int neqs = my_oscillator.neqs; + + vec_type y("solution", neqs), f("function", neqs); + auto y_h = Kokkos::create_mirror(y); + y_h(0) = 1; + y_h(1) = 0; + Kokkos::deep_copy(y, y_h); + + constexpr double tstart = 0, tend = 1.024; + Kokkos::View num_steps("Max Steps", 8); + num_steps(0) = 512; + num_steps(1) = 256; + num_steps(2) = 128; + num_steps(3) = 64; + num_steps(4) = 32; + num_steps(5) = 16; + num_steps(6) = 8; + num_steps(7) = 4; + vec_type y_new("y new", neqs), y_old("y old", neqs); + + // Since y_old_h will be reused to set initial conditions + // for each method tested we do not want to use + // create_mirror_view which would not do a copy + // when y_old is in HostSpace. + typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 1; + y_old_h(1) = 0; + + // First compute analytical solution as reference + // and to evaluate the error from each RK method. 
+ vec_type y_ref("reference value", neqs); + auto y_ref_h = Kokkos::create_mirror(y_ref); + { + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::RangePolicy my_policy(0, 1); + solution_wrapper wrapper(my_oscillator, tend, y_old, y_ref); + Kokkos::parallel_for(my_policy, wrapper); + + Kokkos::deep_copy(y_ref_h, y_ref); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nAnalytical solution" << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" + << std::endl; +#endif + } + + typename vec_type::HostMirror error("error", num_steps.extent(0)); + test_rate( + my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + + for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { + double expected_ratio = + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), + KokkosODE::Impl::ButcherTableau<1, 1>::order); + double actual_ratio = error(idx + 1) / error(idx); + EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.15); + +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / + Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio + << ", expected ratio: " << expected_ratio + << ", rel diff: " << rel_ratio_diff << std::endl; +#endif + } + + Kokkos::deep_copy(error, 0); + test_rate( + my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + + for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { + double expected_ratio = + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), + KokkosODE::Impl::ButcherTableau<2, 3>::order); + double actual_ratio = error(idx + 1) / error(idx); + EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05); + +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / + Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio + << ", expected ratio: " << expected_ratio + << ", rel diff: " << rel_ratio_diff << std::endl; +#endif + } + + Kokkos::deep_copy(error, 
0); + test_rate( + my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + + for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { + double expected_ratio = + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), + KokkosODE::Impl::ButcherTableau<4, 5>::order); + double actual_ratio = error(idx + 1) / error(idx); + EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05); + +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / + Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio + << ", expected ratio: " << expected_ratio + << ", rel diff: " << rel_ratio_diff << std::endl; +#endif + } +} // test_convergence_rate + +} // namespace Test + +int test_RK() { + Test::test_RK(); + return 1; +} + +int test_RK_conv_rate() { + Test::test_convergence_rate(); + return 1; +} + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, RKSolve_serial) { test_RK(); } +TEST_F(TestCategory, RK_conv_rate) { test_RK_conv_rate(); } +#endif diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp new file mode 100644 index 0000000000..2adc202ddc --- /dev/null +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -0,0 +1,198 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_RungeKutta.hpp" + +namespace Test { + +// R1 = 1e-6*1.85e10 * exp(-15618 / T) * (reac) ( 1 – (1- 10^-9) reac) +// d(reac)/dt = -R1 +// d(prod)/dt = R1 +struct chem_model_1 { + constexpr static int neqs = 2; + // constexpr static double alpha = 1e-6*1.85e10; + constexpr static double alpha = 1.85e10; + constexpr static double beta = 15618; + constexpr static double gamma = 1 - 10e-9; + + const double tstart, tend, T0, T1; + + chem_model_1(const double tstart_ = 0, const double tend_ = 100, + const double T0_ = 300, const double T1_ = 800) + : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; + + template + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + // First compute the temperature + // using linear ramp from T0 to T1 + // between tstart and tend. + double T = (T1 - T0) * (t - tstart) / (tend - tstart) + T0; + + // Evaluate the chemical reaction rate + f(0) = -alpha * Kokkos::exp(-beta / T) * y(0) * (1 - gamma * y(0)); + f(1) = -f(0); + } +}; + +struct chem_model_2 { + constexpr static int neqs = 7; + constexpr static double alpha1 = 1e-6 * 3334169440721739.0 * 1500; + constexpr static double beta1 = 207850000.0 / 8314.0; + constexpr static double alpha2 = 1e-6 * 49997793980831.89 * 1500; + constexpr static double beta2 = 207850000.0 / 8314.0; + + const double tstart, tend, T0, T1; + + chem_model_2(const double tstart_ = 0, const double tend_ = 1200, + const double T0_ = 300, const double T1_ = 1000) + : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; + + template + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + // First compute the temperature + // using linear ramp from T0 to T1 + // between tstart and tend. 
+ double T = (T1 - T0) * (t - tstart) / (1500 - tstart) + T0; + + // Evaluate the chemical reaction rates + double R1 = y(0) * alpha1 * Kokkos::exp(-beta1 / T); + double R2 = y(1) * alpha2 * Kokkos::exp(-beta2 / T); + + // Evaluate the chemical reaction rate + f(0) = -R1; + f(1) = -R2; + f(2) = R1 + 0.08 * R2; + f(3) = 0.147 * R2; + f(4) = 0.453 * R2; + f(5) = 0.187 * R2; + f(6) = 0.133 * R2; + } +}; + +template +void test_chem() { + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using RK_type = KokkosODE::Experimental::RK_type; + using solver_type = KokkosODE::Experimental::RungeKutta; + + { + chem_model_1 chem_model; + const int neqs = chem_model.neqs; + const int num_steps = 15000; + + KokkosODE::Experimental::ODE_params params(num_steps); + vec_type tmp("tmp vector", neqs); + mv_type kstack("k stack", neqs, solver_type::num_stages()); + + // Set initial conditions + vec_type y_new("solution", neqs); + vec_type y_old("initial conditions", neqs); + auto y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 1; + y_old_h(1) = 0; + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + + Kokkos::RangePolicy my_policy(0, 1); + RKSolve_wrapper + solve_wrapper(chem_model, params, chem_model.tstart, chem_model.tend, + y_old, y_new, tmp, kstack); + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + const double dt = (chem_model.tend - chem_model.tstart) / params.num_steps; + std::cout << "\nChem model 1" << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend + << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 + << std::endl; + std::cout << " dt=" << dt << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" + << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" + << std::endl; +#endif + } + + { 
+ chem_model_2 chem_model; + const int neqs = chem_model.neqs; + const int num_steps = 1500; + + KokkosODE::Experimental::ODE_params params(num_steps); + vec_type tmp("tmp vector", neqs); + mv_type kstack("k stack", neqs, solver_type::num_stages()); + + // Set initial conditions + vec_type y_new("solution", neqs); + vec_type y_old("initial conditions", neqs); + auto y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 0.25; + y_old_h(1) = 0.25; + y_old_h(2) = 0; + y_old_h(3) = 0; + y_old_h(4) = 0; + y_old_h(5) = 0; + y_old_h(6) = 0; + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + + Kokkos::RangePolicy my_policy(0, 1); + RKSolve_wrapper + solve_wrapper(chem_model, params, chem_model.tstart, chem_model.tend, + y_old, y_new, tmp, kstack); + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + const double dt = (chem_model.tend - chem_model.tstart) / params.num_steps; + std::cout << "\nChem model 2" << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend + << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 + << std::endl; + std::cout << " dt=" << dt << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << ", " + << y_old_h(2) << ", " << y_old_h(3) << ", " << y_old_h(4) << ", " + << y_old_h(5) << ", " << y_old_h(6) << "}" << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << ", " + << y_new_h(2) << ", " << y_new_h(3) << ", " << y_new_h(4) << ", " + << y_new_h(5) << ", " << y_new_h(6) << "}" << std::endl; +#endif + } +} // test_chem +} // namespace Test + +int test_chem_models() { + Test::test_chem(); + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, RK_chem_models) { test_chem_models(); } +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in 
b/ode/unit_test/backends/Test_Cuda_ODE.cpp similarity index 75% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in rename to ode/unit_test/backends/Test_Cuda_ODE.cpp index 5e6b197460..c901a6b116 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in +++ b/ode/unit_test/backends/Test_Cuda_ODE.cpp @@ -13,12 +13,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER +#ifndef TEST_CUDA_ODE_CPP +#define TEST_CUDA_ODE_CPP -#ifndef KOKKOSBLAS1_ROT_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ROT_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_ROT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif +#include +#include + +#endif // TEST_CUDA_ODE_CPP diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_decl.hpp.in b/ode/unit_test/backends/Test_HIP_ODE.cpp similarity index 75% rename from blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_decl.hpp.in rename to ode/unit_test/backends/Test_HIP_ODE.cpp index 2ae20b772c..8f0d8838dc 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_decl.hpp.in +++ b/ode/unit_test/backends/Test_HIP_ODE.cpp @@ -13,12 +13,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER +#ifndef TEST_HIP_ODE_CPP +#define TEST_HIP_ODE_CPP -#ifndef KOKKOSBLAS_GESV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS_GESV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS_GESV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif +#include "Test_HIP.hpp" +#include "Test_ODE.hpp" + +#endif // TEST_HIP_ODE_CPP diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in b/ode/unit_test/backends/Test_OpenMPTarget_ODE.cpp similarity index 75% rename from blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in rename to ode/unit_test/backends/Test_OpenMPTarget_ODE.cpp index bdac3456e8..049f0778da 100644 --- 
a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in +++ b/ode/unit_test/backends/Test_OpenMPTarget_ODE.cpp @@ -13,12 +13,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER +#ifndef TEST_OPENMPTARGET_ODE_CPP +#define TEST_OPENMPTARGET_ODE_CPP -#ifndef KOKKOSBLAS1_SUM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SUM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_SUM_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif +#include "Test_OpenMPTarget.hpp" +#include "Test_ODE.hpp" + +#endif // TEST_OPENMPTARGET_ODE_CPP diff --git a/ode/unit_test/backends/Test_OpenMP_ODE.cpp b/ode/unit_test/backends/Test_OpenMP_ODE.cpp new file mode 100644 index 0000000000..3cefeb4666 --- /dev/null +++ b/ode/unit_test/backends/Test_OpenMP_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMP_ODE_CPP +#define TEST_OPENMP_ODE_CPP + +#include +#include + +#endif // TEST_OPENMP_ODE_CPP diff --git a/ode/unit_test/backends/Test_SYCL_ODE.cpp b/ode/unit_test/backends/Test_SYCL_ODE.cpp new file mode 100644 index 0000000000..9fd7b8f034 --- /dev/null +++ b/ode/unit_test/backends/Test_SYCL_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SYCL_ODE_CPP +#define TEST_SYCL_ODE_CPP + +#include +#include + +#endif // TEST_SYCL_ODE_CPP diff --git a/ode/unit_test/backends/Test_Serial_ODE.cpp b/ode/unit_test/backends/Test_Serial_ODE.cpp new file mode 100644 index 0000000000..31ef4b0489 --- /dev/null +++ b/ode/unit_test/backends/Test_Serial_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SERIAL_ODE_CPP +#define TEST_SERIAL_ODE_CPP + +#include +#include + +#endif // TEST_SERIAL_ODE_CPP diff --git a/ode/unit_test/backends/Test_Threads_ODE.cpp b/ode/unit_test/backends/Test_Threads_ODE.cpp new file mode 100644 index 0000000000..ff438a5883 --- /dev/null +++ b/ode/unit_test/backends/Test_Threads_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_THREADS_ODE_CPP +#define TEST_THREADS_ODE_CPP + +#include +#include + +#endif // TEST_THREADS_ODE_CPP diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index bb083873f4..adfc336576 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -17,19 +17,24 @@ */ #ifndef KOKKOSKERNELS_PERFTEST_BENCHMARK_CONTEXT_HPP -#define KOKKOSKENERLS_PERFTEST_BENCHMARK_CONTEXT_HPP +#define KOKKOSKERNELS_PERFTEST_BENCHMARK_CONTEXT_HPP +#include "KokkosKernels_PrintConfiguration.hpp" + +#include #include #include #include +#include +#include namespace KokkosKernelsBenchmark { /// \brief Remove unwanted spaces and colon signs from input string. In case of /// invalid input it will return an empty string. -std::string remove_unwanted_characters(std::string str) { +inline std::string remove_unwanted_characters(std::string str) { auto from = str.find_first_not_of(" :"); auto to = str.find_last_not_of(" :"); @@ -43,9 +48,10 @@ std::string remove_unwanted_characters(std::string str) { /// \brief Extract all key:value pairs from kokkos configuration and add it to /// the benchmark context -void add_kokkos_configuration(bool verbose) { +inline void add_kokkos_configuration(bool verbose) { std::ostringstream msg; Kokkos::print_configuration(msg, verbose); + KokkosKernels::print_configuration(msg); // Iterate over lines returned from kokkos and extract key:value pairs std::stringstream ss{msg.str()}; @@ -62,10 +68,94 @@ void add_kokkos_configuration(bool verbose) { } } -/// \brief Gather all context information and add it to benchmark context data -void add_benchmark_context(bool verbose = false) { - // Add Kokkos configuration to benchmark context data +/// \brief Add Kokkos Kernels git info and google benchmark release to +/// benchmark context. 
+inline void add_version_info() { + using namespace KokkosKernels::Impl; + + if (!GIT_BRANCH.empty()) { + benchmark::AddCustomContext("GIT_BRANCH", std::string(GIT_BRANCH)); + benchmark::AddCustomContext("GIT_COMMIT_HASH", + std::string(GIT_COMMIT_HASH)); + benchmark::AddCustomContext("GIT_CLEAN_STATUS", + std::string(GIT_CLEAN_STATUS)); + benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", + std::string(GIT_COMMIT_DESCRIPTION)); + benchmark::AddCustomContext("GIT_COMMIT_DATE", + std::string(GIT_COMMIT_DATE)); + } + if (!BENCHMARK_VERSION.empty()) { + benchmark::AddCustomContext("GOOGLE_BENCHMARK_VERSION", + std::string(BENCHMARK_VERSION)); + } +} + +inline void add_env_info() { + auto num_threads = std::getenv("OMP_NUM_THREADS"); + if (num_threads) { + benchmark::AddCustomContext("OMP_NUM_THREADS", num_threads); + } + auto dynamic = std::getenv("OMP_DYNAMIC"); + if (dynamic) { + benchmark::AddCustomContext("OMP_DYNAMIC", dynamic); + } + auto proc_bind = std::getenv("OMP_PROC_BIND"); + if (proc_bind) { + benchmark::AddCustomContext("OMP_PROC_BIND", proc_bind); + } + auto places = std::getenv("OMP_PLACES"); + if (places) { + benchmark::AddCustomContext("OMP_PLACES", places); + } +} + +/// \brief Gather all context information and add it to benchmark context +inline void add_benchmark_context(bool verbose = false) { add_kokkos_configuration(verbose); + add_version_info(); + add_env_info(); +} + +template +inline auto register_benchmark(const char* name, FuncType func, + std::vector arg_names, + std::vector args, int repeat, + ArgsToCallOp&&... func_args) { + if (repeat > 0) { + return benchmark::RegisterBenchmark( + name, func, std::forward(func_args)...) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime() + ->Iterations(repeat); + } else { + return benchmark::RegisterBenchmark( + name, func, std::forward(func_args)...) 
+ ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); + } +} + +template +inline auto register_benchmark_real_time(const char* name, FuncType func, + std::vector arg_names, + std::vector args, int repeat, + ArgsToCallOp&&... func_args) { + if (repeat > 0) { + return benchmark::RegisterBenchmark( + name, func, std::forward(func_args)...) + ->ArgNames(arg_names) + ->Args(args) + ->UseRealTime() + ->Iterations(repeat); + } else { + return benchmark::RegisterBenchmark( + name, func, std::forward(func_args)...) + ->ArgNames(arg_names) + ->Args(args) + ->UseRealTime(); + } } } // namespace KokkosKernelsBenchmark diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 28752e9c6c..cf1905d6d4 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -27,7 +27,7 @@ if (KokkosKernels_ENABLE_PERFTESTS) TARGET_COMPILE_FEATURES(kokkoskernelsperf_gtest PUBLIC cxx_std_11) KOKKOSKERNELS_INCLUDE_DIRECTORIES(sparse) - + if(KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) #Add RPS implementations of KK perf tests here KOKKOSKERNELS_ADD_EXECUTABLE( @@ -48,104 +48,19 @@ if (KokkosKernels_ENABLE_PERFTESTS) ADD_COMPONENT_SUBDIRECTORY(graph) ADD_COMPONENT_SUBDIRECTORY(sparse) ADD_COMPONENT_SUBDIRECTORY(blas) + ADD_COMPONENT_SUBDIRECTORY(ode) ADD_SUBDIRECTORY(performance) #ADD_SUBDIRECTORY(common) endif() -IF(KokkosKernels_ENABLE_BENCHMARK) - - IF (KOKKOSKERNELS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") - ENDIF() - - find_package(benchmark QUIET) - - IF(benchmark_FOUND) - MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") - ELSE() - message(STATUS "No installed google benchmark found, fetching from GitHub") - include(FetchContent) - SET(BENCHMARK_ENABLE_TESTING OFF) - - list(APPEND CMAKE_MESSAGE_INDENT " ") - #Note: recent bug (google/benchmark#1441) is preventing us from using - # the latest benchmark release. 
- FetchContent_Declare( - googlebenchmark - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b - ) - FetchContent_MakeAvailable(googlebenchmark) - list(POP_BACK CMAKE_MESSAGE_INDENT) - - include_directories(${benchmark_SOURCE_DIR}/include) - - # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") - ENDIF() - - target_compile_options(benchmark PRIVATE -w) - target_compile_options(benchmark_main PRIVATE -w) - ENDIF() - - KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) - - FUNCTION(KOKKOSKERNELS_ADD_BENCHMARK NAME) - CMAKE_PARSE_ARGUMENTS( - BENCHMARK - "" - "" - "SOURCES" - ${ARGN} - ) - IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - MESSAGE( - WARNING - "Unexpected arguments when adding a benchmark: " - ${BENCHMARK_UNPARSED_ARGUMENTS} - ) - ENDIF() - - SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) - - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - ${BENCHMARK_SOURCES} - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkoskernels - ) - FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) - SET_SOURCE_FILES_PROPERTIES( - ${SOURCE_FILE} - PROPERTIES LANGUAGE CXX - ) - ENDFOREACH() - - STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - SET( - BENCHMARK_ARGS - --benchmark_counters_tabular=true - --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json - ) - - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} - ) - ENDFUNCTION() - - SET( - BENCHMARK_SOURCES - BenchmarkMain.cpp - ) - +if(KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( PerformanceTest_Benchmark - SOURCES ${BENCHMARK_SOURCES} + SOURCES + blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp + blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp + blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp + BenchmarkMain.cpp ) - endif() diff --git 
a/perf_test/KokkosKernels_perf_test_instantiation.hpp b/perf_test/KokkosKernels_perf_test_instantiation.hpp new file mode 100644 index 0000000000..9ed5ec23bc --- /dev/null +++ b/perf_test/KokkosKernels_perf_test_instantiation.hpp @@ -0,0 +1,133 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +// +// Created by Berger-Vergiat, Luc on 2/6/23. +// + +#ifndef KOKKOSKERNELS_PERF_TEST_INSTANTIATION_HPP +#define KOKKOSKERNELS_PERF_TEST_INSTANTIATION_HPP + +#include "KokkosKernels_perf_test_utilities.hpp" + +#ifndef KOKKOSKERNELS_PERF_TEST_NAME +#error "The macro KOKKOSKERNELS_PERF_TEST_NAME was not defined" +#endif + +int main_instantiation(int argc, char** argv) { + perf_test::CommonInputParams params; + perf_test::parse_common_options(argc, argv, params); + + /* Assumption is that use_openmp/use_threads variables are */ + /* provided as numbers of threads */ + int num_threads = 1; + if (params.use_openmp) { + num_threads = params.use_openmp; + } else if (params.use_threads) { + num_threads = params.use_threads; + } + + int device_id = 0; + if (params.use_cuda) + device_id = params.use_cuda - 1; + else if (params.use_hip) + device_id = params.use_hip - 1; + else if (params.use_sycl) + device_id = params.use_sycl - 1; + + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); + Kokkos::print_configuration(std::cout); + std::cout << '\n'; + + bool ran = false; + + if (params.use_openmp) { +#if defined(KOKKOS_ENABLE_OPENMP) + 
std::cout << "Running on OpenMP backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); + ran = true; +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + std::cout << "Running on Threads backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); + ran = true; +#else + std::cout << "ERROR: Threads requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (params.use_cuda) { +#if defined(KOKKOS_ENABLE_CUDA) + std::cout << "Running on Cuda backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); + ran = true; +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + std::cout << "Running on HIP backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); + ran = true; +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + std::cout << "Running on SYCL backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, + params); + ran = true; +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (!ran) { +#if defined(KOKKOS_ENABLE_SERIAL) + std::cout << "Running on Serial backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); +#else + std::cout << "ERROR: Tried to run on Serial device (as no parallel" + " backends requested), but Serial is not enabled.\n"; + Kokkos::finalize(); + return 1; +#endif + } + Kokkos::finalize(); + return 0; +} + +// Undefine the macro to avoid potential bad interaction +// with other parts of the code... 
+#undef KOKKOSKERNELS_PERF_TEST_NAME + +#endif // KOKKOSKERNELS_PERF_TEST_INSTANTIATION_HPP diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp new file mode 100644 index 0000000000..0df96f4494 --- /dev/null +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -0,0 +1,194 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +// +// Created by Berger-Vergiat, Luc on 2/6/23. +// + +#ifndef KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP +#define KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP + +#include "KokkosKernels_TestUtils.hpp" // for string_compare_no_case + +// Namepsace that defines common utilities +// for performance tests +namespace perf_test { + +struct CommonInputParams { + int use_cuda = 0; + int use_hip = 0; + int use_sycl = 0; + int use_openmp = 0; + int use_threads = 0; + + int repeat = 0; +}; + +std::string list_common_options() { + std::ostringstream common_options; + common_options + << "\t[Required] BACKEND:\n" + << "\t\t'--threads [numThreads]' |\n" + << "\t\t'--openmp [numThreads]' |\n" + << "\t\t'--cuda [deviceIndex]' |\n" + << "\t\t'--hip [deviceIndex]' |\n" + << "\t\t'--sycl [deviceIndex]'\n\n" + << "\tIf no parallel backend is requested, Serial will be used " + "(if enabled)\n\n"; + + return common_options.str(); +} + +void process_arg_int(char const* str_val, int& val) { + errno = 0; + char* ptr_end; + val = std::strtol(str_val, &ptr_end, 10); + + if (str_val == ptr_end) { + std::stringstream ss; + ss << "Error: 
cannot convert command line argument '" << str_val + << "' to an integer.\n"; + throw std::invalid_argument(ss.str()); + } + + if (errno == ERANGE) { + std::stringstream ss; + ss << "Error: converted value for command line argument '" << str_val + << "' falls out of range.\n"; + throw std::invalid_argument(ss.str()); + } +} + +void process_arg_double(char const* str_val, double& val) { + errno = 0; + char* ptr_end; + val = std::strtod(str_val, &ptr_end); + + if (str_val == ptr_end) { + std::stringstream ss; + ss << "Error: cannot convert command line argument '" << str_val + << "' to a double.\n"; + throw std::invalid_argument(ss.str()); + } + + if (errno == ERANGE) { + std::stringstream ss; + ss << "Error: converted value for command line argument '" << str_val + << "' falls out of range.\n"; + throw std::invalid_argument(ss.str()); + } +} + +bool check_arg_int(int const i, int const argc, char** argv, char const* name, + int& val) { + if (0 != Test::string_compare_no_case(argv[i], name)) { + return false; + } + + if (i < argc - 1) { + process_arg_int(argv[i + 1], val); + } else { + std::stringstream msg; + msg << name << " input argument needs to be followed by an int"; + throw std::invalid_argument(msg.str()); + } + return true; +} + +bool check_arg_double(int const i, int const argc, char** argv, + char const* name, double& val) { + if (0 != Test::string_compare_no_case(argv[i], name)) { + return false; + } + + if (i < argc - 1) { + process_arg_double(argv[i + 1], val); + } else { + std::stringstream msg; + msg << name << " input argument needs to be followed by a real number"; + throw std::invalid_argument(msg.str()); + } + return true; +} + +bool check_arg_bool(int const i, int const /*argc*/, char** argv, + char const* name, bool& val) { + if (0 != Test::string_compare_no_case(argv[i], name)) { + return false; + } + val = true; + return true; +} + +bool check_arg_str(int const i, int const argc, char** argv, char const* name, + std::string& val) { + if (0 != 
Test::string_compare_no_case(argv[i], name)) { + return false; + } + + if (i < argc - 1) { + val = std::string(argv[i + 1]); + } else { + std::stringstream msg; + msg << name << " input argument needs to be followed by a string"; + throw std::invalid_argument(msg.str()); + } + return true; +} + +void parse_common_options(int& argc, char** argv, CommonInputParams& params) { + // Skip the program name, start with argIdx=1 + int argIdx = 1; + // Note: after parsing a GPU device ID, always add 1 to it. + // If e.g. params.use_cuda is 0, that means CUDA will not be used at all. + // But if it's N, then it means run on CUDA device N-1. + while (argIdx < argc) { + bool remove_flag = false; + if (check_arg_int(argIdx, argc, argv, "--threads", params.use_threads)) { + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--openmp", + params.use_openmp)) { + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--cuda", params.use_cuda)) { + params.use_cuda++; + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--hip", params.use_hip)) { + params.use_hip++; + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--sycl", params.use_sycl)) { + params.use_sycl++; + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--repeat", params.repeat)) { + remove_flag = true; + } + + if (remove_flag) { + // Shift the remainder of the argv list by one. Note that argv has + // (argc + 1) arguments, the last one always being nullptr. 
The following + // loop moves the trailing nullptr element as well + for (int k = argIdx; k < argc - 1; ++k) { + argv[k] = argv[k + 2]; + argv[k + 1] = argv[k + 3]; + } + argc = argc - 2; + } else { + ++argIdx; + } + } +} // parse_common_options() + +} // namespace perf_test + +#endif // KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp index 5d6bf72450..314439b6c0 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Kokkos::Timer timer; /// diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp index 950674e39e..3f15ca0b2d 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -178,7 +178,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Kokkos::Timer timer; /// diff --git a/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp b/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp index 40c0ad8f3d..5bf6061fe4 100644 --- a/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp +++ b/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Kokkos::Timer timer; /// @@ -220,8 +220,7 @@ int main(int argc, char *argv[]) { using Layout = typename 
AMatrixValueViewLL::array_layout; using EXSP = typename AMatrixValueViewLL::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp b/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp index 6c6e88b8e0..c0ce8f0bd4 100644 --- a/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp +++ b/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp @@ -50,7 +50,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; /// /// input arguments parsing @@ -250,8 +250,7 @@ int main(int argc, char *argv[]) { using Layout = typename AMatrixValueViewLL::array_layout; using EXSP = typename AMatrixValueViewLL::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp b/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp index 663ea400be..1eaacbde5e 100644 --- a/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp +++ b/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp @@ -141,9 +141,9 @@ void readCRSFromMM(std::string name, const VType &V, const IntType &r, r_h(tmp_row) = i; current_row = read_row; - // if (VType::Rank == 1) + // if (VType::rank == 1) // input >> V_h(i); - if (VType::Rank == 2) + if (VType::rank == 2) for (size_t j = 0; j < V_h.extent(0); ++j) input >> V_h(j, i); } diff --git a/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp b/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp index 
35efd40a16..17b8ad6d3e 100644 --- a/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp +++ b/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp @@ -23,7 +23,7 @@ struct BSPMV_Functor_View { typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; typedef typename AMatrix::non_const_value_type entries_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const value_type* alpha; const AMatrix m_A_values; diff --git a/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp b/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp index 81d828c51d..06ea55e303 100644 --- a/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp +++ b/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp @@ -126,7 +126,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Kokkos::Timer timer; /// diff --git a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp index 546cc84cab..2294c23805 100644 --- a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp +++ b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp @@ -163,7 +163,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; /// /// input arguments parsing diff --git a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp index d1a21b3053..808e235edc 100644 --- a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp +++ b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp @@ -381,7 +381,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - 
// typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; /// /// input arguments parsing diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp new file mode 100644 index 0000000000..1e537ceadc --- /dev/null +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -0,0 +1,142 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosBlas_dot_perf_test.hpp" +#include + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// The Level 1 BLAS perform scalar, vector and vector-vector operations; +// +// https://github.com/kokkos/kokkos-kernels/wiki/BLAS-1%3A%3Adot +// +// Usage: result = KokkosBlas::dot(x,y); KokkosBlas::dot(r,x,y); +// Multiplies each value of x(i) [x(i,j)] with y(i) or [y(i,j)] and computes the +// sum. (If x and y have scalar type Kokkos::complex, the complex conjugate of +// x(i) or x(i,j) will be used.) VectorX: A rank-1 Kokkos::View VectorY: A +// rank-1 Kokkos::View ReturnVector: A rank-0 or rank-1 Kokkos::View +// +// REQUIREMENTS: +// Y.rank == 1 or X.rank == 1 +// Y.extent(0) == X.extent(0) + +// Dot Test design: +// 1) create 1D View containing 1D matrix, aka a vector; this will be your X +// input matrix; 2) create 1D View containing 1D matrix, aka a vector; this will +// be your Y input matrix; 3) perform the dot operation on the two inputs, and +// capture result in "result" + +// Here, m represents the desired length for each 1D matrix; +// "m" is used here, because code from another test was adapted for this test. 
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +static void run(benchmark::State& state) { + const auto m = state.range(0); + const auto n = state.range(1); + const auto repeat = state.range(2); + // Declare type aliases + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + + std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" + << ExecSpace::name() << ")\n"; + + std::cout << "Each test input vector has a length of " << m << std::endl; + + Kokkos::View x( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), m, n); + + Kokkos::View y( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "y"), m, n); + + Kokkos::View result( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x dot y"), n); + + // Declaring variable pool w/ a seeded random number; + // a parallel random number generator, so you + // won't get the same number with a given seed each time + Kokkos::Random_XorShift64_Pool pool(123); + + Kokkos::fill_random(x, pool, 10.0); + Kokkos::fill_random(y, pool, 10.0); + + for (auto _ : state) { + // do a warm up run of dot: + KokkosBlas::dot(result, x, y); + + // The live test of dot: + + Kokkos::fence(); + Kokkos::Timer timer; + + for (int i = 0; i < repeat; i++) { + KokkosBlas::dot(result, x, y); + ExecSpace().fence(); + } + + // Kokkos Timer set up + double total = timer.seconds(); + double avg = total / repeat; + // Flops calculation for a 1D matrix dot product per test run; + size_t flopsPerRun = (size_t)2 * m * n; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); + state.SetIterationTime(timer.seconds()); + + state.counters["Avg DOT time (s):"] = + benchmark::Counter(avg, benchmark::Counter::kDefaults); + state.counters["Avg DOT FLOP/s:"] = + benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + } +} + +BENCHMARK(run) + ->Name("KokkosBlas_dot_mv") + 
->ArgNames({"m", "n", "repeat"}) + ->Args({100000, 5, 20}) + ->UseManualTime(); diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp new file mode 100644 index 0000000000..14957994d1 --- /dev/null +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -0,0 +1,140 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosBlas_dot_perf_test.hpp" +#include + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// The Level 1 BLAS perform scalar, vector and vector-vector operations; +// +// https://github.com/kokkos/kokkos-kernels/wiki/BLAS-1%3A%3Adot +// +// Usage: result = KokkosBlas::dot(x,y); KokkosBlas::dot(r,x,y); +// Multiplies each value of x(i) [x(i,j)] with y(i) or [y(i,j)] and computes the +// sum. (If x and y have scalar type Kokkos::complex, the complex conjugate of +// x(i) or x(i,j) will be used.) VectorX: A rank-1 Kokkos::View VectorY: A +// rank-1 Kokkos::View ReturnVector: A rank-0 or rank-1 Kokkos::View +// +// REQUIREMENTS: +// Y.rank == 1 or X.rank == 1 +// Y.extent(0) == X.extent(0) + +// Dot Test design: +// 1) create 1D View containing 1D matrix, aka a vector; this will be your X +// input matrix; 2) create 1D View containing 1D matrix, aka a vector; this will +// be your Y input matrix; 3) perform the dot operation on the two inputs, and +// capture result in "result" + +// Here, m represents the desired length for each 1D matrix; +// "m" is used here, because code from another test was adapted for this test. 
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +static void run(benchmark::State& state) { + const auto m = state.range(0); + const auto repeat = state.range(1); + // Declare type aliases + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + + std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" + << ExecSpace::name() << ")\n"; + + std::cout << "Each test input vector has a length of " << m << std::endl; + + // Create 1D view w/ Device as the ExecSpace; this is an input vector + // A(view_alloc(WithoutInitializing, "label"), m, n); + Kokkos::View x( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), m); + + // Create 1D view w/ Device as the ExecSpace; this is the output vector + Kokkos::View y( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "y"), m); + + // Declaring variable pool w/ a seeded random number; + // a parallel random number generator, so you + // won't get the same number with a given seed each time + Kokkos::Random_XorShift64_Pool pool(123); + + Kokkos::fill_random(x, pool, 10.0); + Kokkos::fill_random(y, pool, 10.0); + + for (auto _ : state) { + // do a warm up run of dot: + KokkosBlas::dot(x, y); + + // The live test of dot: + Kokkos::fence(); + Kokkos::Timer timer; + + for (int i = 0; i < repeat; i++) { + KokkosBlas::dot(x, y); + ExecSpace().fence(); + } + + // Kokkos Timer set up + double total = timer.seconds(); + double avg = total / repeat; + // Flops calculation for a 1D matrix dot product per test run; + size_t flopsPerRun = (size_t)2 * m; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); + state.SetIterationTime(timer.seconds()); + + state.counters["Avg DOT time (s):"] = + benchmark::Counter(avg, benchmark::Counter::kDefaults); + state.counters["Avg DOT FLOP/s:"] = + benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + } +} + 
+BENCHMARK(run) + ->Name("KokkosBlas_dot") + ->ArgNames({"m", "repeat"}) + ->Args({100000, 1}) + ->UseManualTime(); diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp new file mode 100644 index 0000000000..165f7fe6db --- /dev/null +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -0,0 +1,146 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include "KokkosKernels_TestUtils.hpp" + +#include + +// Functor to handle the case of a "without Cuda" build +template +struct teamDotFunctor { + // Compile - time check to see if your data type is a Kokkos::View: + static_assert(Kokkos::is_view::value, + "Vector is not a Kokkos::View."); + + using Scalar = typename Vector::non_const_value_type; + // Vector is templated on memory space + using execution_space = ExecSpace; // Kokkos Execution Space + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + + // Declare Kokkos::View Vectors, x and y + Vector x; + Vector y; + + // Functor instead of KOKKOS_LAMBDA expression + + KOKKOS_INLINE_FUNCTION void operator()(const team_member& team) const { + KokkosBlas::Experimental::dot(team, x, y); + } + // Constructor + teamDotFunctor(Vector X_, Vector Y_) { + x = X_; + y = Y_; + } +}; + +template +static void run(benchmark::State& state) { + const auto m = state.range(0); + const auto repeat = state.range(1); + // Declare type aliases + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + + // For the Team implementation of dot; ExecSpace is implicit; + using policy = Kokkos::TeamPolicy; + + // Create 1D view w/ Device as the 
ExecSpace; this is an input vector + Kokkos::View x("X", m); + // Create 1D view w/ Device as the ExecSpace; this is the output vector + Kokkos::View y("Y", m); + + // Here, deep_copy is filling / copying values into Host memory from Views X + // and Y + Kokkos::deep_copy(x, 3.0); + Kokkos::deep_copy(y, 2.0); + + std::cout << "Running BLAS Level 1 Kokkos Teams-based implementation DOT " + "performance experiment (" + << ExecSpace::name() << ")\n"; + + std::cout << "Each test input vector has a length of " << m << std::endl; + + for (auto _ : state) { + // Warm up run of dot: + teamDotFunctor, ExecSpace> + teamDotFunctorWarmUpInstance(x, y); + + Kokkos::parallel_for("TeamDotUsage -- Warm Up Run", policy(1, Kokkos::AUTO), + teamDotFunctorWarmUpInstance); + + // The live test of dot: + + Kokkos::fence(); + Kokkos::Timer timer; + + teamDotFunctor, ExecSpace> + teamDotFunctorLiveTestInstance(x, y); + Kokkos::parallel_for("TeamDotUsage -- Live Test", policy(1, Kokkos::AUTO), + teamDotFunctorLiveTestInstance); + + // Kokkos Timer set up and data capture + double total = timer.seconds(); + double avg = total / repeat; + // Flops calculation for a 1D matrix dot product per test run; + size_t flopsPerRun = (size_t)2 * m; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); + state.SetIterationTime(timer.seconds()); + + state.counters["Avg DOT time (s):"] = + benchmark::Counter(avg, benchmark::Counter::kDefaults); + state.counters["Avg DOT FLOP/s:"] = + benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + } +} + +BENCHMARK(run) + ->Name("KokkosBlas_team_dot/run") + ->ArgNames({"m", "repeat"}) + ->Args({100000, 1}) + ->UseManualTime(); diff --git a/perf_test/blas/blas2/CMakeLists.txt b/perf_test/blas/blas2/CMakeLists.txt index f69c576cd3..9c2aa424d1 100644 --- a/perf_test/blas/blas2/CMakeLists.txt +++ b/perf_test/blas/blas2/CMakeLists.txt @@ -5,3 +5,10 @@ KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas2_gemv_perf_test 
SOURCES KokkosBlas2_gemv_perf_test.cpp ) + +IF(KokkosKernels_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + Blas2_Benchmark + SOURCES KokkosBlas2_gemv_perf_test_benchmark.cpp + ) +ENDIF() diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp new file mode 100644 index 0000000000..962328eb95 --- /dev/null +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -0,0 +1,223 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include "KokkosBlas2_gemv.hpp" + +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include +#include + +struct blas2_gemv_params : public perf_test::CommonInputParams { + int m = 5000; + int n = 5000; + bool layoutLeft = true; + + static blas2_gemv_params get_params(int& argc, char** argv) { + blas2_gemv_params params; + perf_test::parse_common_options(argc, argv, params); + + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (std::string layout; + perf_test::check_arg_str(i, argc, argv, "--layout", layout)) { + if (0 == Test::string_compare_no_case(layout, "left")) + params.layoutLeft = true; + else if (0 == Test::string_compare_no_case(layout, "right")) + params.layoutLeft = false; + else { + std::cerr << "Invalid layout: must be 'left' or 
'right'.\n"; + exit(1); + } + ++i; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + exit(1); + } + } + return params; + } + + static void print_options() { + std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + + std::cerr + << "\t[Optional] --m :: number of rows to generate (default 5000)" + << std::endl; + std::cerr + << "\t[Optional] --n :: number of cols to generate (default 5000)" + << std::endl; + std::cerr << "\t[Optional] --layout :: matrix layout ('left' or 'right', " + "default 'left')" + << std::endl; + } +}; + +template +static void KokkosBlas2_GEMV(benchmark::State& state) { + const auto m = state.range(0); + const auto n = state.range(1); + + // Declare type aliases + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + + // Create a View containing a 2D matrix; allocate KokkosView with template + // args of Scalar**, a layout, and + Kokkos::View A( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), m, n); + // Create Views containing 1D matrix; allocate (without) matrix "x" of size n + Kokkos::View x( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), n); + // Create Views containing 1D matrix; allocate (without) matrix "y" of size m + Kokkos::View y( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "y"), m); + + // Declaring variable pool w/ a number seed; + // a parallel random number generator, so you + // won't get the same number with a given seed each time + Kokkos::Random_XorShift64_Pool pool(123); + + // Fill 2D Matrix "A" and 1D matrix (i.e., a vector) "x" with random values; + // Here, 10 is the max value of the random generator between 1 and 10 + // (uniform ) + Kokkos::fill_random(A, pool, 10.0); + Kokkos::fill_random(x, pool, 10.0); + + // Do a warm-up run + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + Kokkos::fence(); + double total_time = 0.0; + + for (auto _ : state) { + // Start 
timing + Kokkos::Timer timer; + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + ExecSpace().fence(); + + double time = timer.seconds(); + total_time += time; + state.SetIterationTime(time); + } + + state.counters[ExecSpace::name()] = 1; + state.counters["Avg GEMV time (s):"] = + benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); + size_t flopsPerRun = (size_t)2 * m * n; + state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( + flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); +} + +template +void run(const blas2_gemv_params& params) { + using Scalar = double; + + const auto name = "KokkosBlas2_GEMV"; + const auto arg_names = std::vector{ + "m", "n", params.layoutLeft ? "LayoutLeft" : "LayoutRight"}; + const auto args = std::vector{params.m, params.n, 1}; + + if (params.layoutLeft) { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GEMV, + arg_names, args, params.repeat); + } else { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GEMV, + arg_names, args, params.repeat); + } +} + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + const auto params = blas2_gemv_params::get_params(argc, argv); + + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + run(params); +#else + std::cout << "ERROR: PThreads requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_openmp) { +#if defined(KOKKOS_ENABLE_OPENMP) + run(params); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_cuda) { +#if defined(KOKKOS_ENABLE_CUDA) + run(params); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + run(params); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + 
return 1; +#endif + } + + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + run(params); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + + // use serial if no backend is specified + if (!params.use_cuda and !params.use_hip and !params.use_openmp and + !params.use_sycl and !params.use_threads) { +#if defined(KOKKOS_ENABLE_SERIAL) + run(params); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + return 0; +} diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index 90097b86f8..80c9d25c1c 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -12,3 +12,9 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosBlas3_gemm_standalone_perf_test.cpp ) +IF(KokkosKernels_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + Blas3_gemm_benchmark + SOURCES KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp + ) +ENDIF() diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index e1137aaeea..8fe23d2515 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -25,14 +25,14 @@ #include +#include "KokkosBatched_HostLevel_Gemm.hpp" #include "KokkosBatched_Gemm_Decl.hpp" -#include "KokkosBatched_Gemm_Serial_Impl.hpp" -//#include "KokkosBatched_Gemm_Team_Impl.hpp" -//#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" #include "KokkosBatched_Util.hpp" #include "gtest/gtest.h" // EXPECT_NEAR #include "KokkosKernels_TestUtils.hpp" +#include + #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #include "armpl.h" #else @@ -464,80 +464,99 @@ void __do_gemm_parallel_batched_heuristic_template(options_t options, STATUS; if (a == 'N' && b == 'N') { - if (options.blas_args.batch_size_last_dim) - if 
(options.use_simd) + if constexpr (std::is_same_v) { + if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - else if (options.use_simd) + } + } else if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - + } } else if (a == 'N' && b == 'T') { - if (options.blas_args.batch_size_last_dim) - if (options.use_simd) + if constexpr (std::is_same_v) { + if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - else if (options.use_simd) + } + } else if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); + } //} else if (a == 'N' && b == 'C') { // __do_gemm_serial_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { - if (options.blas_args.batch_size_last_dim) - if (options.use_simd) + if constexpr (std::is_same_v) { + if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, 
gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - else if (options.use_simd) + } + } else if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); + } } else if (a == 'T' && b == 'T') { - if (options.blas_args.batch_size_last_dim) - if (options.use_simd) + if constexpr (std::is_same_v) { + if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - else if (options.use_simd) + } + } else if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); + } //} else if (a == 'T' && b == 'C') { // __do_gemm_serial_batched_template(options, gemm_args); //} else if (a == 'C' && b == 'N') { @@ -1334,7 +1353,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { simd_view_type C("C", simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); - // uint64_t seed = Kokkos::Impl::clock_tic(); + // uint64_t seed = + // std::chrono::high_resolution_clock::now().time_since_epoch().count(); // Kokkos::Random_XorShift64_Pool rand_pool(seed); // 
Kokkos::fill_random(A, rand_pool, // Kokkos::rand, @@ -1444,7 +1464,7 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { view_type C((scalar_type *)C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); internal_vector_view_type C_vector_internal(C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); @@ -1914,7 +1934,8 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { using execution_space = typename device_type::execution_space; gemm_args_t gemm_args; - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); STATUS; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp new file mode 100644 index 0000000000..32d91e6b33 --- /dev/null +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -0,0 +1,215 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosBlas3_gemm.hpp" +#include +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" +#include "Benchmark_Context.hpp" +#include + +struct blas3_gemm_params : public perf_test::CommonInputParams { + int m = 1000; + int n = 1000; + int k = 1000; + + static blas3_gemm_params get_params(int& argc, char** argv) { + blas3_gemm_params params; + perf_test::parse_common_options(argc, argv, params); + + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--k", params.k)) { + ++i; + } else if (std::string(argv[i]).find("--benchmark") == 0) { + continue; // ignore benchmark arguments + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + exit(1); + } + } + return params; + } + + static void print_options() { + std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --m :: Rows in A (default 1000)" + << std::endl; + std::cerr + << "\t[Optional] --n :: Columns in A / Rows in B (default 1000)" + << std::endl; + std::cerr << "\t[Optional] --k :: Columns in B (default 1000)" + << std::endl; + } +}; + +template +static void KokkosBlas3_GEMM(benchmark::State& state) { + const auto m = state.range(0); + const auto n = state.range(1); + const auto k = state.range(2); + + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + Kokkos::View A( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), m, n); + Kokkos::View B( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), n, k); + Kokkos::View C( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C"), m, k); + Kokkos::Random_XorShift64_Pool pool(123); + 
Kokkos::fill_random(A, pool, 10.0); + Kokkos::fill_random(B, pool, 10.0); + + // Do a warm-up run + KokkosBlas::gemm("N", "N", 1.0, A, B, 0.0, C); + Kokkos::fence(); + double total_time = 0.0; + + for (auto _ : state) { + Kokkos::Timer timer; + KokkosBlas::gemm("N", "N", 1.0, A, B, 0.0, C); + ExecSpace().fence(); + + double time = timer.seconds(); + total_time += time; + state.SetIterationTime(time); + } + + state.counters[ExecSpace::name()] = 1; + state.counters["Avg GEMM time (s):"] = + benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); + size_t flopsPerRun = (size_t)2 * m * n * k; + state.counters["Avg GEMM (FLOP/s):"] = benchmark::Counter( + flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); + if constexpr (std::is_same_v) { + state.counters["Memory Layout in A: LayoutLeft"] = 1; + } else { + state.counters["Memory Layout in A: LayoutRight"] = 1; + } + if constexpr (std::is_same_v) { + state.counters["Memory Layout in B: LayoutLeft"] = 1; + } else { + state.counters["Memory Layout in B: LayoutRight"] = 1; + } +} + +template +void run(const blas3_gemm_params& params) { + using LL = Kokkos::LayoutLeft; + using LR = Kokkos::LayoutRight; + using Scalar = double; + + const auto name = "KokkosBlas3_GEMM"; + const auto arg_names = std::vector{"m", "n", "k"}; + const auto args = std::vector{params.m, params.n, params.k}; + + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas3_GEMM, arg_names, args, + params.repeat); + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas3_GEMM, arg_names, args, + params.repeat); + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas3_GEMM, arg_names, args, + params.repeat); + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas3_GEMM, arg_names, args, + params.repeat); +} + +int main(int argc, char** argv) { + const auto params = blas3_gemm_params::get_params(argc, argv); + const int num_threads = params.use_openmp; + const int device_id = params.use_cuda - 1; + + 
Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + run(params); +#else + std::cout << "ERROR: PThreads requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_openmp) { +#if defined(KOKKOS_ENABLE_OPENMP) + run(params); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_cuda) { +#if defined(KOKKOS_ENABLE_CUDA) + run(params); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + run(params); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + run(params); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + + // use serial if no backend is specified + if (!params.use_cuda and !params.use_hip and !params.use_openmp and + !params.use_sycl and !params.use_threads) { +#if defined(KOKKOS_ENABLE_SERIAL) + run(params); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + return 0; +} diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 7f7e8e25ad..90f7a90617 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -27,6 +27,8 @@ #include "KokkosBatched_Trmm_Serial_Impl.hpp" #include "KokkosBatched_Util.hpp" +#include + //#define PERF_TEST_DEBUG // Forward declarations @@ -611,7 +613,8 @@ trmm_args_t 
__do_setup(options_t options, matrix_dims_t dim) { using execution_space = typename device_type::execution_space; trmm_args_t trmm_args; - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); decltype(dim.a.m) min_dim = dim.a.m < dim.a.n ? dim.a.m : dim.a.n; typename vta::HostMirror host_A; diff --git a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp index 7a81a191bb..cbadcef0b1 100644 --- a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp @@ -27,6 +27,8 @@ #include "KokkosBatched_Trtri_Serial_Impl.hpp" #include "KokkosBatched_Util.hpp" +#include + //#define TRTRI_PERF_TEST_DEBUG // Forward declarations @@ -436,7 +438,8 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { using execution_space = typename device_type::execution_space; trtri_args_t trtri_args; - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); decltype(dim.a.m) min_dim = dim.a.m < dim.a.n ? dim.a.m : dim.a.n; typename vta::HostMirror host_A; diff --git a/perf_test/graph/CMakeLists.txt b/perf_test/graph/CMakeLists.txt index 134a7acc2e..26eab42ed4 100644 --- a/perf_test/graph/CMakeLists.txt +++ b/perf_test/graph/CMakeLists.txt @@ -16,10 +16,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosGraph_mis_d2.cpp ) - -#Below will probably fail on GPUs. 
-#KOKKOSKERNELS_ADD_EXECUTABLE( -# graph_triangle -# SOURCES KokkosGraph_triangle.cpp -# ) +KOKKOSKERNELS_ADD_EXECUTABLE( + graph_triangle + SOURCES KokkosGraph_triangle.cpp + ) diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cc969e52a1..57f241d7b1 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -379,7 +379,7 @@ void run_experiment(crsGraph_t crsGraph, int num_cols, Parameters params) { } } - if (params.coloring_output_file != NULL) { + if (params.coloring_output_file != "") { std::ofstream os(params.coloring_output_file, std::ofstream::out); KokkosKernels::Impl::print_1Dview(os, colors, true, "\n"); } @@ -420,7 +420,7 @@ void run_multi_mem_experiment(Parameters params) { // typedef typename slow_graph_t::entries_type::const_type // const_slow_cols_view_t; - char *a_mat_file = params.a_mtx_bin_file; + const char *a_mat_file = params.a_mtx_bin_file.c_str(); // char *b_mat_file = params.b_mtx_bin_file; // char *c_mat_file = params.c_mtx_bin_file; @@ -581,7 +581,7 @@ int main(int argc, char **argv) { if (parse_inputs(params, argc, argv)) { return 1; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a matrix file" << std::endl; return 0; } diff --git a/perf_test/graph/KokkosGraph_multimem_triangle.hpp b/perf_test/graph/KokkosGraph_multimem_triangle.hpp deleted file mode 100644 index f7875fed0e..0000000000 --- a/perf_test/graph/KokkosGraph_multimem_triangle.hpp +++ /dev/null @@ -1,205 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosGraph_run_triangle.hpp" -#include "KokkosSparse_CrsMatrix.hpp" - -namespace KokkosKernels { - -namespace Experiment { - -template -void run_multi_mem_triangle(Parameters params) { - typedef exec_space myExecSpace; - typedef Kokkos::Device myFastDevice; - typedef Kokkos::Device mySlowExecSpace; - - typedef typename KokkosSparse::CrsMatrix - fast_crstmat_t; - typedef typename fast_crstmat_t::StaticCrsGraphType fast_graph_t; - - typedef typename KokkosSparse::CrsMatrix - slow_crstmat_t; - typedef typename slow_crstmat_t::StaticCrsGraphType slow_graph_t; - - char *a_mat_file = params.a_mtx_bin_file; - // char *b_mat_file = params.b_mtx_bin_file; - // char *c_mat_file = params.c_mtx_bin_file; - - slow_graph_t a_slow_crsgraph, /*b_slow_crsgraph,*/ c_slow_crsgraph; - fast_graph_t a_fast_crsgraph, /*b_fast_crsgraph,*/ c_fast_crsgraph; - - // read a and b matrices and store them on slow or fast memory. 
- if (params.a_mem_space == 1) { - fast_crstmat_t a_fast_crsmat; - a_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); - a_fast_crsgraph = a_fast_crsmat.graph; - a_fast_crsgraph.num_cols = a_fast_crsmat.numCols(); - - } else { - slow_crstmat_t a_slow_crsmat; - a_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); - a_slow_crsgraph = a_slow_crsmat.graph; - a_slow_crsgraph.num_cols = a_slow_crsmat.numCols(); - } - - if (params.a_mem_space == 1) { - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, fast_graph_t, fast_graph_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsgraph, - /*b_fast_crsgraph,*/ params); - } else { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, fast_graph_t, fast_graph_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsgraph, - /*b_fast_crsgraph,*/ params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, fast_graph_t, slow_graph_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsgraph, - /*b_fast_crsgraph,*/ params); - } else { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, fast_graph_t, slow_graph_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsgraph, - /*b_fast_crsgraph,*/ params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, slow_graph_t, fast_graph_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsgraph, - /*b_slow_crsgraph,*/ params); - } else { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, slow_graph_t, fast_graph_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsgraph, - /*b_slow_crsgraph,*/ params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, slow_graph_t, slow_graph_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsgraph, - /*b_slow_crsgraph,*/ params); - } else { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, slow_graph_t, slow_graph_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsgraph, - /*b_slow_crsgraph,*/ params); - } - } - } - } else { - // A is in slow memory - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, fast_graph_t, fast_graph_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsgraph, - /*b_fast_crsgraph,*/ params); - } else { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, fast_graph_t, fast_graph_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsgraph, - /*b_fast_crsgraph,*/ params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, fast_graph_t, slow_graph_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsgraph, - /*b_fast_crsgraph,*/ params); - } else { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, fast_graph_t, slow_graph_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsgraph, - /*b_fast_crsgraph,*/ params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, slow_graph_t, fast_graph_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsgraph, - /*b_slow_crsgraph,*/ params); - } else { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, slow_graph_t, fast_graph_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsgraph, - /*b_slow_crsgraph,*/ params); - } - - } else { - // C is in slow memory. - if (params.work_mem_space == 1) { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, slow_graph_t, slow_graph_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsgraph, - /*b_slow_crsgraph,*/ params); - } else { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, slow_graph_t, slow_graph_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsgraph, - /*b_slow_crsgraph,*/ params); - } - } - } - } -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/graph/KokkosGraph_run_triangle.hpp b/perf_test/graph/KokkosGraph_run_triangle.hpp deleted file mode 100644 index 30d1ec77f6..0000000000 --- a/perf_test/graph/KokkosGraph_run_triangle.hpp +++ /dev/null @@ -1,290 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosGraph_Triangle.hpp" -#include "KokkosKernels_TestParameters.hpp" - -#define TRANPOSEFIRST false -#define TRANPOSESECOND false - -namespace KokkosKernels { - -namespace Experiment { -template -bool is_same_graph(crsGraph_t output_mat1, crsGraph_t output_mat2) { - // typedef typename crsGraph_t::StaticCrsGraphType crsGraph_t; - typedef typename crsGraph_t::row_map_type::non_const_type lno_view_t; - typedef typename crsGraph_t::entries_type::non_const_type lno_nnz_view_t; - // typedef typename crsGraph_t::values_type::non_const_type scalar_view_t; - - size_t nrows1 = output_mat1.row_map.extent(0); - size_t nentries1 = output_mat1.entries.extent(0); - - size_t nrows2 = output_mat2.row_map.extent(0); - size_t nentries2 = output_mat2.entries.extent(0); - // size_t nvals2 = output_mat2.values.extent(0); - - KokkosKernels::sort_crs_graph( - output_mat1.graph.row_map, output_mat1.entries); - - if (nrows1 != nrows2) return false; - if (nentries1 != nentries2) return false; - - KokkosKernels::sort_crs_graph( - output_mat2.graph.row_map, output_mat2.entries); - - bool is_identical = true; - is_identical = KokkosKernels::Impl::kk_is_identical_view< - typename crsGraph_t::row_map_type, typename crsGraph_t::row_map_type, - typename lno_view_t::value_type, typename device::execution_space>( - output_mat1.row_map, output_mat2.row_map, 0); - if (!is_identical) return false; - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, - typename 
device::execution_space>(output_mat1.entries, - output_mat2.entries, 0); - if (!is_identical) return false; - - if (!is_identical) { - std::cout << "Incorret values" << std::endl; - } - return true; -} - -template -struct Flush { - typedef double value_type; - - // flush a large host buffer - Kokkos::View _buf; - Flush(int flush_option) : _buf("Flush::buf", BufSize) { - Kokkos::deep_copy(_buf, 1); - Kokkos::fence(); - if (flush_option == 2) { - for (size_t i = 0; i < BufSize; ++i) { - _buf(i) = rand(); - } - } - } - - KOKKOS_INLINE_FUNCTION - void init(value_type &update) { update = 0; } - - KOKKOS_INLINE_FUNCTION - void join(value_type &update, const value_type &input) { update += input; } - - KOKKOS_INLINE_FUNCTION - void operator()(const int i, value_type &update) const { update += _buf[i]; } - - void run() { - double sum = 0; - Kokkos::parallel_reduce( - "KokkosGraph::PerfTest::Flush", - Kokkos::RangePolicy(0, BufSize / sizeof(double)), *this, - sum); - SpaceType().fence(); - std::cout << "Flush sum:" << sum << std::endl; - FILE *fp = fopen("/dev/null", "w"); - fprintf(fp, "%f\n", sum); - fclose(fp); - - /* - #pragma omp parallel - { - const size_t cache_line = 64; - const char *cp = (const char *) _buf.data(); - size_t i = 0; - - - for (i = 0; i < BufSize; i += cache_line) { - asm volatile("clflush (%0)\n\t" - : - : "r"(&cp[i]) - : "memory"); - } - - asm volatile("sfence\n\t" - : - : - : "memory"); - } - */ - } -}; - -template -void run_experiment(crsGraph_t crsGraph, Parameters params) { - // using namespace KokkosSparse; - using namespace KokkosSparse; - using namespace KokkosGraph::Experimental; - // using namespace KokkosSparse::Experimental; - - int algorithm = params.algorithm; - int repeat = params.repeat; - int chunk_size = params.chunk_size; - - int shmemsize = params.shmemsize; - int team_size = params.team_size; - int use_dynamic_scheduling = params.use_dynamic_scheduling; - int verbose = params.verbose; - - int accumulator = params.accumulator; - 
// char spgemm_step = params.spgemm_step; - int vector_size = params.vector_size; - - // spgemm_step++; - - typedef typename crsGraph_t3::row_map_type::non_const_type lno_view_t; - typedef typename crsGraph_t3::entries_type::non_const_type lno_nnz_view_t; - - Kokkos::View row_mapC; - lno_nnz_view_t entriesC; - lno_nnz_view_t valuesC; - - typedef typename lno_nnz_view_t::value_type lno_t; - typedef typename lno_view_t::value_type size_type; - - typedef KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, lno_t, ExecSpace, TempMemSpace, PersistentMemSpace> - KernelHandle; - - KernelHandle kh; - kh.set_team_work_size(chunk_size); - kh.set_shmem_size(shmemsize); - kh.set_suggested_team_size(team_size); - kh.set_suggested_vector_size(vector_size); - - if (use_dynamic_scheduling) { - kh.set_dynamic_scheduling(true); - } - if (verbose) { - kh.set_verbose(true); - } - const lno_t m = crsGraph.numRows(); - ; - - for (int i = 0; i < repeat; ++i) { - size_type rowmap_size = crsGraph.entries.extent(0); - switch (algorithm) { - case 16: - kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_AI); - rowmap_size = m; - break; - case 17: - kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA); - std::cout << "IA" << std::endl; - break; - case 18: kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA_UNION); break; - case 19: - kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_LL); - rowmap_size = m; - break; - case 20: - kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_LU); - rowmap_size = m; - break; - default: kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA); break; - } - - kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); - - kh.get_spgemm_handle()->set_sort_lower_triangular(params.right_sort); - kh.get_spgemm_handle()->set_create_lower_triangular( - params.right_lower_triangle); - kh.get_spgemm_handle()->set_compression(params.apply_compression); - kh.get_spgemm_handle()->set_sort_option(params.sort_option); - 
kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); - - switch (accumulator) { - case 0: - default: - kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_DEFAULT); - break; - case 1: - kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_DENSE); - break; - case 2: - kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_SPARSE); - break; - } - - constexpr size_t LLC_CAPACITY = 256 * 4 * 1024 * 1024; - if (params.cache_flush) { - std::cout << "Flushing cache with option:" << params.cache_flush - << std::endl; - Flush flush(params.cache_flush); - flush.run(); - } - if (i == 0) { - kh.get_spgemm_handle()->set_read_write_cost_calc( - params.calculate_read_write_cost); - } - - Kokkos::Timer timer1; - - row_mapC = - Kokkos::View("non_const_lnow_row", rowmap_size); - entriesC = lno_nnz_view_t(""); - valuesC = lno_nnz_view_t(""); - - double symbolic_time = 0; - if (params.triangle_options == 0) { - if (params.apply_compression) { - triangle_generic( - &kh, m, crsGraph.row_map, crsGraph.entries, - KOKKOS_LAMBDA(const lno_t &row, const lno_t &col_set_index, - const lno_t &col_set, const lno_t &thread_id) { - // row_mapC(row) += KokkosKernels::Impl::set_bit_count(col_set); - row_mapC(row) += KokkosKernels::Impl::pop_count(col_set); - }); - } else { - triangle_generic( - &kh, m, crsGraph.row_map, crsGraph.entries, - KOKKOS_LAMBDA(const lno_t &row, const lno_t &col_set_index, - const lno_t &col_set, const lno_t &thread_id) { - row_mapC(row) += 1; - // row_mapC(row) += KokkosKernels::Impl::set_bit_count(col_set); row_mapC(row) += - // KokkosKernels::Impl::pop_count(col_set); - }); - } - - size_t num_triangles = 0; - KokkosKernels::Impl::kk_reduce_view, - ExecSpace>(rowmap_size, row_mapC, - num_triangles); - ExecSpace().fence(); - - symbolic_time = timer1.seconds(); - std::cout << "num_triangles:" << num_triangles << std::endl; - } - kh.destroy_spgemm_handle(); - std::cout << "mm_time:" << symbolic_time << std::endl; - // only do this once - // 
kh.get_spgemm_handle()->set_read_write_cost_calc(false); - } -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 153382f111..a250fe15a7 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -14,15 +14,56 @@ // //@HEADER #include - -#include "KokkosGraph_multimem_triangle.hpp" #include "KokkosKernels_IOUtils.hpp" +#include "KokkosGraph_Triangle.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_IOUtils.hpp" //for read_kokkos_crst_graph +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestParameters.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +template +struct Flush { + typedef double value_type; + + // flush a large host buffer + Kokkos::View _buf; + Flush(int flush_option) : _buf("Flush::buf", BufSize) { + Kokkos::deep_copy(_buf, 1); + Kokkos::fence(); + if (flush_option == 2) { + for (size_t i = 0; i < BufSize; ++i) { + _buf(i) = rand(); + } + } + } + + KOKKOS_INLINE_FUNCTION + void init(value_type &update) { update = 0; } + + KOKKOS_INLINE_FUNCTION + void join(value_type &update, const value_type &input) { update += input; } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &update) const { update += _buf[i]; } + + void run() { + double sum = 0; + Kokkos::parallel_reduce( + "KokkosGraph::PerfTest::Flush", + Kokkos::RangePolicy(0, BufSize / sizeof(double)), *this, + sum); + SpaceType().fence(); + std::cout << "Flush sum:" << sum << std::endl; + FILE *fp = fopen("/dev/null", "w"); + fprintf(fp, "%f\n", sum); + fclose(fp); + } +}; void print_options() { std::cerr << "Options\n" << std::endl; - std::cerr << "Choose BackEnd : --openmp [numthreads] | " - "--cuda | --hip" - << std::endl; + std::cerr << perf_test::list_common_options(); std::cerr << "Input Matrix : --amtx [path_to_input_matrix]" << std::endl; @@ -32,13 +73,14 @@ void 
print_options() { std::cerr << "\t\t.bin: it will read binary crs matrix format." << std::endl; std::cerr << "\t\t.crs: it will read text crs matrix format." << std::endl; std::cerr << "--algorithm :" << std::endl; - std::cerr << "\tTRIANGLEAI: for Adj x Incidence" << std::endl; - std::cerr << "\tTRIANGLEIA: for Incidence x Adj -- implementing set " - "intersection (2D) -- 3rd fastest" - << std::endl; - std::cerr - << "\tTRIANGLEIAUNION: for Incidence x Adj -- implementing set union " - << std::endl; + // BMK 3-28-23: these algorithms do not give correct triangle counts + // std::cerr << "\tTRIANGLEAI: for Adj x Incidence" << std::endl; + // std::cerr << "\tTRIANGLEIA: for Incidence x Adj -- implementing set " + // "intersection (2D) -- 3rd fastest" + // << std::endl; + // std::cerr + // << "\tTRIANGLEIAUNION: for Incidence x Adj -- implementing set union " + // << std::endl; std::cerr << "\tTRIANGLELL: Lower x Lower -- usually fastest " << std::endl; std::cerr << "\tTRIANGLELU: Lower x Upper -- usually 2nd fastest " << std::endl; @@ -87,24 +129,17 @@ void print_options() { std::cerr << "Suggested use of LU: executable --amtx path_to_file.bin " "--algorithm TRIANGLELU --repeat 6 --verbose --chunksize [4|16]" << std::endl; - std::cerr - << "Suggested use of AI: executable --amtx path_to_file.bin --algorithm " - "TRIANGLEIA --repeat 6 --verbose --chunksize [4|16] rlt" - << std::endl; + // std::cerr + // << "Suggested use of AI: executable --amtx path_to_file.bin --algorithm + // " + // "TRIANGLEIA --repeat 6 --verbose --chunksize [4|16] rlt" + // << std::endl; } int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, char **argv) { for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - 
params.use_cuda = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { + if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { params.repeat = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--triangle_operation")) { @@ -117,44 +152,6 @@ int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, params.vector_size = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--compression")) { params.apply_compression = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--sort_option")) { - params.sort_option = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--memspaces")) { - int memspaces = atoi(argv[++i]); - int memspaceinfo = memspaces; - std::cout << "memspaceinfo:" << memspaceinfo << std::endl; - if (memspaceinfo & 1) { - params.a_mem_space = 1; - std::cout << "Using HBM for A" << std::endl; - } else { - params.a_mem_space = 0; - std::cout << "Using DDR4 for A" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.b_mem_space = 1; - std::cout << "Using HBM for B" << std::endl; - } else { - params.b_mem_space = 0; - std::cout << "Using DDR4 for B" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.c_mem_space = 1; - std::cout << "Using HBM for C" << std::endl; - } else { - params.c_mem_space = 0; - std::cout << "Using DDR4 for C" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.work_mem_space = 1; - std::cout << "Using HBM for work memory space" << std::endl; - } else { - params.work_mem_space = 0; - std::cout << "Using DDR4 for work memory space" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; } else if (0 == Test::string_compare_no_case(argv[i], "--flop")) { params.calculate_read_write_cost = 1; } else if (0 == 
Test::string_compare_no_case(argv[i], "--CIF")) { @@ -178,16 +175,7 @@ int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, params.check_output = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { params.a_mtx_bin_file = argv[++i]; - } - /* - else if ( 0 == Test::string_compare_no_case( argv[i] , "cmtx" ) ) { - params.c_mtx_bin_file = argv[++i]; - } - else if ( 0 == Test::string_compare_no_case( argv[i] , "bmtx" ) ) { - params.b_mtx_bin_file = argv[++i]; - } - */ - else if (0 == Test::string_compare_no_case(argv[i], "--dynamic")) { + } else if (0 == Test::string_compare_no_case(argv[i], "--dynamic")) { params.use_dynamic_scheduling = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--cache_flush")) { params.cache_flush = atoi(argv[++i]); @@ -221,11 +209,20 @@ int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, ++i; if (0 == Test::string_compare_no_case(argv[i], "TRIANGLEAI")) { params.algorithm = 16; + std::cerr << "\nAlgorithm TRIANGLEAI is disabled (produces incorrect " + "triangle count)\n"; + return 1; } else if (0 == Test::string_compare_no_case(argv[i], "TRIANGLEIA")) { params.algorithm = 17; + std::cerr << "\nAlgorithm TRIANGLEIA is disabled (produces incorrect " + "triangle count)\n"; + return 1; } else if (0 == Test::string_compare_no_case(argv[i], "TRIANGLEIAUNION")) { params.algorithm = 18; + std::cerr << "\nAlgorithm TRIANGLEIAUNION is disabled (produces " + "incorrect triangle count)\n"; + return 1; } else if (0 == Test::string_compare_no_case(argv[i], "TRIANGLELL")) { params.algorithm = 19; } else if (0 == Test::string_compare_no_case(argv[i], "TRIANGLELU")) { @@ -246,75 +243,162 @@ int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, return 0; } -int main(int argc, char **argv) { - typedef unsigned size_type; - typedef int idx; +template +void run_experiment(int argc, char **argv, perf_test::CommonInputParams) { + using namespace KokkosSparse; + using mem_space = typename 
exec_space::memory_space; + using device_t = Kokkos::Device; + using lno_t = default_lno_t; + using size_type = default_size_type; + using graph_t = + Kokkos::StaticCrsGraph; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, lno_t, exec_space, mem_space, mem_space>; + + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + std::cerr + << "** Triangle counting is currently not supported on GPU backends.\n"; + return; + } KokkosKernels::Experiment::Parameters params; if (parse_inputs(params, argc, argv)) { - return 1; + return; } - if (params.a_mtx_bin_file == NULL) { - std::cerr << "Provide a matrix file" << std::endl; + if (params.a_mtx_bin_file == "") { + std::cerr << "Provide a graph file" << std::endl; print_options(); - return 0; + return; } - std::cout << "Sizeof(idx):" << sizeof(idx) + std::cout << "Sizeof(idx):" << sizeof(lno_t) << " sizeof(size_type):" << sizeof(size_type) << std::endl; - const int num_threads = - params.use_openmp; // Assumption is that use_openmp variable is provided - // as number of threads - const int device_id = 0; - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - -#if defined(KOKKOS_ENABLE_OPENMP) - - if (params.use_openmp) { - Kokkos::OpenMP().print_configuration(std::cout); -#ifdef KOKKOSKERNELS_MULTI_MEM - KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::OpenMP, Kokkos::OpenMP::memory_space, - Kokkos::HostSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::OpenMP, Kokkos::OpenMP::memory_space, - Kokkos::OpenMP::memory_space>(params); -#endif - } + // read graph + graph_t crsGraph = KokkosSparse::Impl::read_kokkos_crst_graph( + params.a_mtx_bin_file.c_str()); -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if (params.use_cuda) { - Kokkos::Cuda().print_configuration(std::cout); -#ifdef KOKKOSKERNELS_MULTI_MEM - 
KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::CudaHostPinnedSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::Cuda::memory_space>(params); -#endif - } + int algorithm = params.algorithm; + int repeat = params.repeat; + int chunk_size = params.chunk_size; + + int shmemsize = params.shmemsize; + int team_size = params.team_size; + int use_dynamic_scheduling = params.use_dynamic_scheduling; + int verbose = params.verbose; -#endif + int accumulator = params.accumulator; + int vector_size = params.vector_size; -#if defined(KOKKOS_ENABLE_HIP) - if (params.use_hip) { - Kokkos::Experimental::HIP().print_configuration(std::cout); - KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); + Kokkos::View row_mapC; + + KernelHandle kh; + kh.set_team_work_size(chunk_size); + kh.set_shmem_size(shmemsize); + kh.set_suggested_team_size(team_size); + kh.set_suggested_vector_size(vector_size); + + if (use_dynamic_scheduling) { + kh.set_dynamic_scheduling(true); } -#endif + if (verbose) { + kh.set_verbose(true); + } + const lno_t m = crsGraph.numRows(); - Kokkos::finalize(); + for (int i = 0; i < repeat; ++i) { + size_type rowmap_size = crsGraph.entries.extent(0); + switch (algorithm) { + case 16: + kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_AI); + rowmap_size = m; + break; + case 17: + kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA); + std::cout << "IA" << std::endl; + break; + case 18: kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA_UNION); break; + case 19: + kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_LL); + rowmap_size = m; + break; + case 20: + kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_LU); + rowmap_size = m; + break; + default: kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA); break; + } - return 0; + 
kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); + + kh.get_spgemm_handle()->set_sort_lower_triangular(params.right_sort); + kh.get_spgemm_handle()->set_create_lower_triangular( + params.right_lower_triangle); + kh.get_spgemm_handle()->set_compression(params.apply_compression); + kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); + + switch (accumulator) { + case 0: + default: + kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_DEFAULT); + break; + case 1: + kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_DENSE); + break; + case 2: + kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_SPARSE); + break; + } + + constexpr size_t LLC_CAPACITY = 128 * 1024 * 1024; + if (params.cache_flush) { + std::cout << "Flushing cache with option:" << params.cache_flush + << std::endl; + Flush flush(params.cache_flush); + flush.run(); + } + if (i == 0) { + kh.get_spgemm_handle()->set_read_write_cost_calc( + params.calculate_read_write_cost); + } + + Kokkos::Timer timer1; + + row_mapC = + Kokkos::View("non_const_lnow_row", rowmap_size); + + double symbolic_time = 0; + if (params.triangle_options == 0) { + if (params.apply_compression) { + KokkosGraph::Experimental::triangle_generic( + &kh, m, crsGraph.row_map, crsGraph.entries, + KOKKOS_LAMBDA(const lno_t &row, const lno_t & /* col_set_index */, + const lno_t &col_set, const lno_t & /* thread_id */) { + row_mapC(row) += KokkosKernels::Impl::pop_count(col_set); + }); + } else { + KokkosGraph::Experimental::triangle_generic( + &kh, m, crsGraph.row_map, crsGraph.entries, + KOKKOS_LAMBDA(const lno_t &row, const lno_t & /*col_set_index*/, + const lno_t & /*col_set*/, + const lno_t & /*thread_id*/) { row_mapC(row)++; }); + } + + size_t num_triangles = 0; + KokkosKernels::Impl::kk_reduce_view, + exec_space>(rowmap_size, row_mapC, + num_triangles); + symbolic_time = timer1.seconds(); + std::cout << "num_triangles:" << num_triangles << std::endl; + } + kh.destroy_spgemm_handle(); + 
std::cout << "mm_time:" << symbolic_time << std::endl; + } } + +#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment +#include "KokkosKernels_perf_test_instantiation.hpp" +int main(int argc, char **argv) { + return main_instantiation(argc, argv); +} // main diff --git a/perf_test/ode/CMakeLists.txt b/perf_test/ode/CMakeLists.txt new file mode 100644 index 0000000000..b4aa86889f --- /dev/null +++ b/perf_test/ode/CMakeLists.txt @@ -0,0 +1,8 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +if(KOKKOSKERNELS_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + ode_runge_kutta SOURCES KokkosODE_RK.cpp + ) +endif() diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp new file mode 100644 index 0000000000..e9dc3f2f8e --- /dev/null +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -0,0 +1,365 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include "KokkosODE_RungeKutta.hpp"
+
+#include "KokkosKernels_TestUtils.hpp"
+#include "KokkosKernels_perf_test_utilities.hpp"
+
+#include <benchmark/benchmark.h>
+#include "Benchmark_Context.hpp"
+
+namespace {
+// Single-reaction chemistry model with two unknowns (reactant, product):
+//   R1 = 1e-6*1.85e10 * exp(-15618 / T) * (reac) * (1 - (1 - 10^-9) reac)
+//   d(reac)/dt = -R1
+//   d(prod)/dt = R1
+// The temperature T is a linear ramp going from T0 at tstart to T1 at tend.
+struct chem_model_1 {
+  // Number of equations (unknowns) per ODE system.
+  constexpr static int neqs = 2;
+  // The 1e-6 prefactor of the formula above is not applied here:
+  // constexpr static double alpha = 1e-6*1.85e10;
+  constexpr static double alpha = 1.85e10;
+  constexpr static double beta  = 15618;
+  // (1 - 10^-9) from the rate law. 10^-9 is spelled 1e-9 as a literal;
+  // 10e-9 would be 1e-8, i.e. ten times too large.
+  constexpr static double gamma = 1 - 1e-9;
+
+  const double tstart, tend, T0, T1;
+
+  chem_model_1(const double tstart_ = 0, const double tend_ = 300,
+               const double T0_ = 300, const double T1_ = 800)
+      : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){};
+
+  // Evaluate the right-hand side of the ODE system at time t.
+  //   t      : current time
+  //   (dt)   : unused
+  //   y      : current state, y(0) = reactant (y(1) is not read)
+  //   f      : receives the time derivatives, f(0) = -R1 and f(1) = R1
+  template <class vec_type1, class vec_type2>
+  KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/,
+                                         const vec_type1& y,
+                                         const vec_type2& f) const {
+    // First compute the temperature
+    // using linear ramp from T0 to T1
+    // between tstart and tend.
+    double T = (T1 - T0) * (t - tstart) / (tend - tstart) + T0;
+
+    // Evaluate the chemical reaction rate
+    f(0) = -alpha * Kokkos::exp(-beta / T) * y(0) * (1 - gamma * y(0));
+    f(1) = -f(0);
+  }
+};
+
+// More complex chemical reaction involving two reacting
+// species foam A and foam B, that become 5 products.
+// The temperature is capped at 1000K once t reaches 1500s
+struct chem_model_2 {
+  // Number of equations (unknowns) per ODE system.
+  constexpr static int neqs = 7;
+  constexpr static double alpha1 = 1e-6 * 3334169440721739.0 * 1500;
+  constexpr static double beta1  = 207850000.0 / 8314.0;
+  constexpr static double alpha2 = 1e-6 * 49997793980831.89 * 1500;
+  constexpr static double beta2  = 207850000.0 / 8314.0;
+
+  const double tstart, tend, T0, T1;
+
+  chem_model_2(const double tstart_ = 0, const double tend_ = 2000,
+               const double T0_ = 300, const double T1_ = 1000)
+      : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){};
+
+  // Evaluate the right-hand side of the ODE system at time t.
+  //   t      : current time
+  //   (dt)   : unused
+  //   y      : current state, y(0) and y(1) are the two reacting species
+  //   f      : receives the seven time derivatives
+  template <class vec_type1, class vec_type2>
+  KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/,
+                                         const vec_type1& y,
+                                         const vec_type2& f) const {
+    // First compute the temperature: a linear ramp that equals T0 at
+    // tstart and T1 at t = 1500, capped at 1000. Note that the ramp
+    // uses the literal 1500 and not tend.
+    const double ramp = (T1 - T0) * (t - tstart) / (1500 - tstart) + T0;
+    const double T    = (ramp < 1000) ? ramp : 1000;
+
+    // Evaluate the chemical reaction rates
+    double R1 = y(0) * alpha1 * Kokkos::exp(-beta1 / T);
+    double R2 = y(1) * alpha2 * Kokkos::exp(-beta2 / T);
+
+    // Species A and B are consumed, the five products are created
+    f(0) = -R1;
+    f(1) = -R2;
+    f(2) = R1 + 0.08 * R2;
+    f(3) = 0.147 * R2;
+    f(4) = 0.453 * R2;
+    f(5) = 0.187 * R2;
+    f(6) = 0.133 * R2;
+  }
+};
+
+// Functor that solves one ODE system per index: system idx owns the
+// entries [neqs * idx, neqs * (idx + 1)) of y_old, y_new and tmp, and
+// the matching rows of kstack, where neqs = ode_type::neqs.
+template <class ode_type, class table_type, class vec_type, class mv_type,
+          class scalar_type>
+struct RKSolve_wrapper {
+  using ode_params = KokkosODE::Experimental::ODE_params;
+
+  ode_type my_ode;
+  table_type table;
+  ode_params params;
+
+  scalar_type tstart, tend;
+  vec_type y_old, y_new, tmp;
+  mv_type kstack;
+
+  RKSolve_wrapper(const ode_type& my_ode_, const table_type& table_,
+                  const ode_params& params_, const scalar_type tstart_,
+                  const scalar_type tend_, const vec_type& y_old_,
+                  const vec_type& y_new_, const vec_type& tmp_,
+                  const mv_type& kstack_)
+      : my_ode(my_ode_),
+        table(table_),
+        params(params_),
+        tstart(tstart_),
+        tend(tend_),
+        y_old(y_old_),
+        y_new(y_new_),
+        tmp(tmp_),
+        kstack(kstack_) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const int idx) const {
+    // Take subviews to create the local problem. The pair is a
+    // half-open range, so it has to span all neqs entries of system
+    // idx; the stride is the model's equation count, not a fixed 2.
+    const Kokkos::pair<int, int> local_range(ode_type::neqs * idx,
+                                             ode_type::neqs * (idx + 1));
+    auto local_y_old  = Kokkos::subview(y_old, local_range);
+    auto local_y_new  = Kokkos::subview(y_new, local_range);
+    auto local_tmp    = Kokkos::subview(tmp, local_range);
+    auto local_kstack = Kokkos::subview(kstack, local_range, Kokkos::ALL());
+
+    // Run Runge-Kutta time integrator
+    KokkosODE::Impl::RKSolve(
+        my_ode, table, params, tstart, tend, local_y_old, local_y_new,
+        local_tmp, local_kstack);
+  }
+};
+
+// Command line / benchmark-argument settings of this perf test.
+struct rk_input_parameters {
+  int num_odes;  // number of independent ODE systems to solve
+  int model;     // chemistry model selector: 1 or 2
+  int repeat;
+  bool verbose;  // print problem setup, solution and timing
+
+  rk_input_parameters(const int num_odes_, const int model_, const int repeat_,
+                      const bool verbose_)
+      : num_odes(num_odes_),
+        model(model_),
+        repeat(repeat_),
+        verbose(verbose_){};
+};
+
+}  // namespace
+
+// Benchmark body: builds inputs.num_odes copies of the chemistry model
+// selected by inputs.model and integrates all of them in one
+// parallel_for per benchmark iteration. Any other model value does
+// nothing.
+template
+void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) {
+  using vec_type = Kokkos::View;
+  using mv_type = Kokkos::View;
+  using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>;
+  using ode_params = KokkosODE::Experimental::ODE_params;
+
+  const int num_odes = inputs.num_odes;
+  const int model = inputs.model;
+
+  switch (model) {
+    case 1: {
+      chem_model_1 chem_model;
+      const int neqs = chem_model.neqs;
+      const int num_steps = 15000;
+      const double dt = 0.1;  // only reported in the verbose output
+
+      table_type table;
+      ode_params params(num_steps);
+      vec_type tmp("tmp vector", neqs * num_odes);
+      mv_type kstack("k stack", neqs * num_odes, table.nstages);
+
+      // Set initial conditions: every system starts from the same
+      // state, stored in its own slice of neqs consecutive entries.
+      vec_type y_new("solution", neqs * num_odes);
+      vec_type y_old("initial conditions", neqs * num_odes);
+      auto y_old_h = Kokkos::create_mirror(y_old);
+      for (int ode = 0; ode < num_odes; ++ode) {
+        y_old_h(neqs * ode + 0) = 1;
+        y_old_h(neqs * ode + 1) = 0;
+      }
+      Kokkos::deep_copy(y_old, y_old_h);
+      Kokkos::deep_copy(y_new, y_old_h);
+
+      Kokkos::RangePolicy my_policy(0, num_odes);
+      RKSolve_wrapper solve_wrapper(chem_model, table, params,
+                                    chem_model.tstart, chem_model.tend, y_old,
+                                    y_new, tmp, kstack);
+
+      // run_time accumulates over all benchmark iterations
+      Kokkos::Timer time;
+      time.reset();
+      for (auto _ : state) {
+        (void)_;
+        Kokkos::parallel_for(my_policy, solve_wrapper);
+        Kokkos::fence();
+      }
+      double run_time = time.seconds();
+
+      if (inputs.verbose) {
+        // Only the first system is printed
+        auto y_new_h = Kokkos::create_mirror(y_new);
+        Kokkos::deep_copy(y_new_h, y_new);
+        std::cout << "\nChem model 1" << std::endl;
+        std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend
+                  << std::endl;
+        std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1
+                  << std::endl;
+        std::cout << " dt=" << dt << std::endl;
+        std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}"
+                  << std::endl;
+        std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}"
+                  << std::endl;
+        std::cout << " num odes: " << num_odes << std::endl;
+        std::cout << " time elapsed: " << run_time << std::endl;
+      }
+      break;
+    }
+    case 2: {
+      chem_model_2 chem_model;
+      const int neqs = chem_model.neqs;
+      const int num_steps = 15000;
+      const double dt = 0.1;  // only reported in the verbose output
+
+      table_type table;
+      ode_params params(num_steps);
+      vec_type tmp("tmp vector", neqs * num_odes);
+      mv_type kstack("k stack", neqs * num_odes, table.nstages);
+
+      // Set initial conditions: every system starts from the same
+      // state, stored in its own slice of neqs consecutive entries.
+      vec_type y_new("solution", neqs * num_odes);
+      vec_type y_old("initial conditions", neqs * num_odes);
+      auto y_old_h = Kokkos::create_mirror(y_old);
+      for (int ode = 0; ode < num_odes; ++ode) {
+        y_old_h(neqs * ode + 0) = 0.25;
+        y_old_h(neqs * ode + 1) = 0.25;
+        y_old_h(neqs * ode + 2) = 0;
+        y_old_h(neqs * ode + 3) = 0;
+        y_old_h(neqs * ode + 4) = 0;
+        y_old_h(neqs * ode + 5) = 0;
+        y_old_h(neqs * ode + 6) = 0;
+      }
+      Kokkos::deep_copy(y_old, y_old_h);
+      Kokkos::deep_copy(y_new, y_old_h);
+
+      Kokkos::RangePolicy my_policy(0, num_odes);
+      RKSolve_wrapper solve_wrapper(chem_model, table, params,
+                                    chem_model.tstart, chem_model.tend, y_old,
+                                    y_new, tmp, kstack);
+
+      // run_time accumulates over all benchmark iterations
+      Kokkos::Timer time;
+      time.reset();
+      for (auto _ : state) {
+        (void)_;
+        Kokkos::parallel_for(my_policy, solve_wrapper);
+        Kokkos::fence();
+      }
+      double run_time = time.seconds();
+
+      if (inputs.verbose) {
+        // Only the first two unknowns of the first system are printed
+        auto y_new_h = Kokkos::create_mirror(y_new);
+        Kokkos::deep_copy(y_new_h, y_new);
+        std::cout << "\nChem model 2" << std::endl;
+        std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend
+                  << std::endl;
+        std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1
+                  << std::endl;
+        std::cout << " dt=" << dt << std::endl;
+        std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}"
+                  << std::endl;
+        std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}"
+                  << std::endl;
+        std::cout << " num odes: " << num_odes << std::endl;
+        std::cout << " time elapsed: " << run_time << std::endl;
+      }
+      break;
+    }
+  }
+}
+
+// Print the common perf-test options followed by the options that are
+// specific to this benchmark.
+void print_options() {
+  std::cerr << "Options\n" << std::endl;
+
+  std::cerr << perf_test::list_common_options();
+
+  std::cerr
+      << "\t[Optional] --repeat :: how many times to repeat overall test"
+      << std::endl;
+  std::cerr << "\t[Optional] --verbose :: enable verbose output"
+            << std::endl;
+  std::cerr << "\t[Optional] --n :: number of ode problems to solve"
+            << std::endl;
+  std::cerr
+      << "\t[Optional] --model :: chemical model to be solved: 1 or 2"
+      << std::endl;
+}  // print_options
+
+// Read --n, --model, --repeat and --verbose from argv into params.
+// Returns 1 (after printing the option list) on the first argument that
+// is not one of these, 0 otherwise.
+int parse_inputs(rk_input_parameters& params, int argc, char** argv) {
+  for (int i = 1; i < argc; ++i) {
+    if (perf_test::check_arg_int(i, argc, argv, "--n", params.num_odes)) {
+      ++i;  // skip the value that follows the flag
+    } else if (perf_test::check_arg_int(i, argc, argv, "--model",
+                                        params.model)) {
+      ++i;
+    } else if (perf_test::check_arg_int(i, argc, argv, "--repeat",
+                                        params.repeat)) {
+      ++i;
+    } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose",
+                                         params.verbose)) {
+      // flag without value: nothing to skip
+    } else {
+      std::cerr << "Unrecognized command line argument #" << i << ": "
+                << argv[i] << std::endl;
+      print_options();
+      return 1;
+    }
+  }
+  return 0;
+}  // parse_inputs
+
+// Entry point registered with the benchmark library: defaults come from
+// the benchmark arguments (n, model) and can be overridden on the
+// command line. The result of parse_inputs is not checked, so an
+// unrecognized argument is reported but does not stop the run.
+template
+void run_benchmark_wrapper(benchmark::State& state, int argc, char** argv) {
+  rk_input_parameters params(state.range(0), state.range(1), 1, false);
+  parse_inputs(params, argc, argv);
+  run_ode_chem(state, params);
+}
+
+// Registers a single benchmark (n = 1000, model = 1) and runs it. A
+// positive common_params.repeat fixes the number of iterations;
+// otherwise the benchmark library chooses it.
+int main(int argc, char** argv) {
+  Kokkos::initialize(argc, argv);
+
+  benchmark::Initialize(&argc, argv);
+  benchmark::SetDefaultTimeUnit(benchmark::kMillisecond);
+  KokkosKernelsBenchmark::add_benchmark_context(true);
+
+  perf_test::CommonInputParams common_params;
+  perf_test::parse_common_options(argc, argv, common_params);
+
+  std::string bench_name = "KokkosODE_chem_models";
+
+  if (0 < common_params.repeat) {
+    benchmark::RegisterBenchmark(
+        bench_name.c_str(),
+        run_benchmark_wrapper, argc, argv)
+        ->UseRealTime()
+        ->ArgNames({"n", "model"})
+        ->Args({1000, 1})
+        ->Iterations(common_params.repeat);
+  } else {
+    benchmark::RegisterBenchmark(
+        bench_name.c_str(),
+        run_benchmark_wrapper, argc, argv)
+        ->UseRealTime()
+        ->ArgNames({"n", "model"})
+        ->Args({1000, 1});
+  }
+
+  benchmark::RunSpecifiedBenchmarks();
+
+  benchmark::Shutdown();
+  Kokkos::finalize();
+
+  return 0;
+}
diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt
index 6eac716aca..263f59671a 100644
--- a/perf_test/sparse/CMakeLists.txt
+++ b/perf_test/sparse/CMakeLists.txt
@@ -115,3 +115,24 @@ KOKKOSKERNELS_ADD_EXECUTABLE(
   sparse_mdf
   SOURCES KokkosSparse_mdf.cpp
 )
+
+if (KokkosKernels_ENABLE_BENCHMARK)
+  KOKKOSKERNELS_ADD_BENCHMARK(
+    sparse_par_ilut
+    SOURCES KokkosSparse_par_ilut.cpp
+  )
+
+  # Provide -DGinkgo_DIR to cmake to enable the ginkgo test in sparse_par_ilut. Ginkgo_DIR should
+  # point to the dir in the ginkgo install area that contains the GinkgoConfig.cmake file.
+ # For me, this was $gingko_install_dir/lib64/cmake/Ginkgo + if (Ginkgo_DIR) + find_package(Ginkgo REQUIRED) + + target_compile_definitions(KokkosKernels_sparse_par_ilut PRIVATE "USE_GINKGO") + target_link_libraries(KokkosKernels_sparse_par_ilut PRIVATE Ginkgo::ginkgo) + endif() + + KOKKOSKERNELS_ADD_BENCHMARK( + sparse_spmv_benchmark SOURCES KokkosSparse_spmv_benchmark.cpp + ) +endif() diff --git a/perf_test/sparse/KokkosSparse_mdf.cpp b/perf_test/sparse/KokkosSparse_mdf.cpp index ca48df8fd2..319a43ae11 100644 --- a/perf_test/sparse/KokkosSparse_mdf.cpp +++ b/perf_test/sparse/KokkosSparse_mdf.cpp @@ -19,15 +19,14 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" -#include "KokkosSparse_mdf.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include "KokkosSparse_mdf.hpp" -struct Params { - int use_cuda = 0; - int use_hip = 0; - int use_sycl = 0; - int use_openmp = 0; - int use_threads = 0; +using perf_test::CommonInputParams; + +struct LocalParams { std::string amtx; int m = 10000; int n = 10000; @@ -54,8 +53,61 @@ struct diag_generator_functor { } }; -template -void run_experiment(const Params& params) { +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --amtx :: input matrix" << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + "MDF" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\nSettings for randomly generated A matrix" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; + std::cerr + << "\t[Optional] --nnz :: number of entries per row to generate" + << std::endl; + std::cerr << "\t[Optional] --diag :: generate a diagonal 
matrix" + << std::endl; +} // print_options + +int parse_inputs(LocalParams& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_str(i, argc, argv, "--amtx", params.amtx)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--nnz", + params.nnzPerRow)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--diag", + params.diag)) { + } else if (perf_test::check_arg_int(i, argc, argv, "--repeat", + params.repeat)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +template +void run_experiment(int argc, char** argv, CommonInputParams /*params*/) { + using crsMat_t = + KokkosSparse::CrsMatrix; using size_type = typename crsMat_t::size_type; using lno_t = typename crsMat_t::ordinal_type; using scalar_t = typename crsMat_t::value_type; @@ -67,19 +119,22 @@ void run_experiment(const Params& params) { using entries_t = typename graph_t::entries_type::non_const_type; using values_t = typename crsMat_t::values_type::non_const_type; + LocalParams localParams; + parse_inputs(localParams, argc, argv); + std::cout << "************************************* \n"; std::cout << "************************************* \n"; crsMat_t A; - lno_t m = params.m; - lno_t n = params.n; - if (params.amtx.length()) { - std::cout << "Loading A from " << params.amtx << '\n'; + lno_t m = localParams.m; + lno_t n = localParams.n; + if (localParams.amtx.length()) { + std::cout << "Loading A from " << localParams.amtx << '\n'; A = KokkosSparse::Impl::read_kokkos_crst_matrix( - params.amtx.c_str()); + localParams.amtx.c_str()); m = A.numRows(); n = A.numCols(); 
} else { - if (params.diag) { + if (localParams.diag) { std::cout << "Randomly generating diag matrix\n"; rowmap_t rowmapA("A row map", m + 1); entries_t entriesA("A entries", m); @@ -100,13 +155,13 @@ void run_experiment(const Params& params) { A = crsMat_t("A matrix", m, valuesA, graph); } else { std::cout << "Randomly generating matrix\n"; - size_type nnzUnused = m * params.nnzPerRow; + size_type nnzUnused = m * localParams.nnzPerRow; A = KokkosSparse::Impl::kk_generate_sparse_matrix( m, n, nnzUnused, 0, (n + 3) / 3); } } - if (params.verbose) { + if (localParams.verbose) { std::cout << "Matrix A" << std::endl; std::cout << " row_map A:" << std::endl; KokkosKernels::Impl::print_1Dview(A.graph.row_map); @@ -125,9 +180,12 @@ void run_experiment(const Params& params) { timer.reset(); KokkosSparse::Experimental::MDF_handle handle(A); handle.set_verbosity(0); + if (localParams.verbose) { + handle.set_verbosity(1); + } handleTime += timer.seconds(); - for (int sumRep = 0; sumRep < params.repeat; sumRep++) { + for (int sumRep = 0; sumRep < localParams.repeat; sumRep++) { timer.reset(); KokkosSparse::Experimental::mdf_symbolic(A, handle); Kokkos::fence(); @@ -140,16 +198,16 @@ void run_experiment(const Params& params) { } std::cout << "Mean total time: " - << handleTime + (symbolicTime / params.repeat) + - (numericTime / params.repeat) + << handleTime + (symbolicTime / localParams.repeat) + + (numericTime / localParams.repeat) << std::endl << "Handle time: " << handleTime << std::endl - << "Mean symbolic time: " << (symbolicTime / params.repeat) + << "Mean symbolic time: " << (symbolicTime / localParams.repeat) << std::endl - << "Mean numeric time: " << (numericTime / params.repeat) + << "Mean numeric time: " << (numericTime / localParams.repeat) << std::endl; - if (params.verbose) { + if (localParams.verbose) { entries_t permutation = handle.get_permutation(); std::cout << "MDF permutation:" << std::endl; @@ -157,164 +215,8 @@ void run_experiment(const Params& params) 
{ } } // run_experiment -void print_options() { - std::cerr << "Options\n" << std::endl; - - std::cerr - << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp " - "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" - " | '--sycl [syclDeviceIndex]'" - << std::endl; - - std::cerr << "\t[Optional] --amtx :: input matrix" << std::endl; - std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " - "MDF" - << std::endl; - std::cerr << "\t[Optional] --verbose :: enable verbose output" - << std::endl; - std::cerr << "\nSettings for randomly generated A matrix" << std::endl; - std::cerr << "\t[Optional] --m :: number of rows to generate" - << std::endl; - std::cerr << "\t[Optional] --n :: number of cols to generate" - << std::endl; - std::cerr - << "\t[Optional] --nnz :: number of entries per row to generate" - << std::endl; - std::cerr << "\t[Optional] --diag :: generate a diagonal matrix" - << std::endl; -} // print_options - -int parse_inputs(Params& params, int argc, char** argv) { - for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - params.use_cuda = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { - params.use_sycl = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { - params.amtx = argv[++i]; - } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { - params.m = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { - params.n = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--nnz")) { - params.nnzPerRow = 
atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--diag")) { - params.diag = true; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { - params.repeat = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { - params.verbose = true; - } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; - print_options(); - return 1; - } - } - return 0; -} // parse_inputs - +#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment +#include "KokkosKernels_perf_test_instantiation.hpp" int main(int argc, char** argv) { - Params params; - - if (parse_inputs(params, argc, argv)) { - return 1; - } - const int num_threads = - std::max(params.use_openmp, - params.use_threads); // Assumption is that use_openmp variable - // is provided as number of threads - - // If cuda, hip or sycl is used, set device_id - int device_id = 0; - if (params.use_cuda > 0) { - device_id = params.use_cuda - 1; - } - if (params.use_hip > 0) { - device_id = params.use_hip - 1; - } - if (params.use_sycl > 0) { - device_id = params.use_sycl - 1; - } - - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - - bool useOMP = params.use_openmp != 0; - bool useThreads = params.use_threads != 0; - bool useCUDA = params.use_cuda != 0; - bool useHIP = params.use_hip != 0; - bool useSYCL = params.use_sycl != 0; - bool useSerial = !useOMP && !useCUDA && !useHIP && !useSYCL; - - if (useOMP) { -#if defined(KOKKOS_ENABLE_OPENMP) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: OpenMP requested, but not available.\n"; - return 1; -#endif - } - if (useThreads) { -#if defined(KOKKOS_ENABLE_THREADS) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: OpenMP requested, but not available.\n"; - return 1; -#endif - } - if (useCUDA) { -#if 
defined(KOKKOS_ENABLE_CUDA) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: CUDA requested, but not available.\n"; - return 1; -#endif - } - if (useHIP) { -#if defined(KOKKOS_ENABLE_HIP) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: HIP requested, but not available.\n"; - return 1; -#endif - } - if (useSYCL) { -#if defined(KOKKOS_ENABLE_SYCL) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: SYCL requested, but not available.\n"; - return 1; -#endif - } - if (useSerial) { -#if defined(KOKKOS_ENABLE_SERIAL) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: Serial device requested, but not available.\n"; - return 1; -#endif - } - Kokkos::finalize(); - return 0; + return main_instantiation(argc, argv); } // main diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp deleted file mode 100644 index 269baf3fdc..0000000000 --- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp +++ /dev/null @@ -1,216 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosSparse_run_spgemm.hpp" -#include "KokkosSparse_IOUtils.hpp" - -namespace KokkosKernels { - -namespace Experiment { - -template -void run_multi_mem_spgemm(Parameters params) { - typedef exec_space myExecSpace; - typedef Kokkos::Device myFastDevice; - typedef Kokkos::Device mySlowExecSpace; - - typedef typename KokkosSparse::CrsMatrix - fast_crstmat_t; - typedef typename KokkosSparse::CrsMatrix - slow_crstmat_t; - - char *a_mat_file = params.a_mtx_bin_file; - char *b_mat_file = params.b_mtx_bin_file; - char *c_mat_file = params.c_mtx_bin_file; - - slow_crstmat_t a_slow_crsmat, b_slow_crsmat, c_slow_crsmat; - fast_crstmat_t a_fast_crsmat, b_fast_crsmat, c_fast_crsmat; - - // read a and b matrices and store them on slow or fast memory. - - if (params.a_mem_space == 1) { - a_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); - } else { - a_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); - } - - if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) && - params.b_mem_space == params.a_mem_space) { - std::cout << "Using A matrix for B as well" << std::endl; - b_fast_crsmat = a_fast_crsmat; - b_slow_crsmat = a_slow_crsmat; - } else if (params.b_mem_space == 1) { - if (b_mat_file == NULL) b_mat_file = a_mat_file; - b_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); - } else { - if (b_mat_file == NULL) b_mat_file = a_mat_file; - b_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); - } - - if (params.a_mem_space == 1) { - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } else { - c_fast_crsmat = 
KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } - - } else { - // C is in slow memory. - if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } - } - } - } else { - // A is in slow memory - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } - } - } - } - - if (c_mat_file != NULL) { - if (params.c_mem_space == 1) { - KokkosSparse::sort_crs_matrix(c_fast_crsmat); - - KokkosSparse::Impl::write_graph_bin( - (lno_t)(c_fast_crsmat.numRows()), - (size_type)(c_fast_crsmat.graph.entries.extent(0)), - c_fast_crsmat.graph.row_map.data(), - c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(), - c_mat_file); - } else { - KokkosSparse::sort_crs_matrix(c_slow_crsmat); - - KokkosSparse::Impl::write_graph_bin( - (lno_t)c_slow_crsmat.numRows(), - (size_type)c_slow_crsmat.graph.entries.extent(0), - c_slow_crsmat.graph.row_map.data(), - c_slow_crsmat.graph.entries.data(), c_slow_crsmat.values.data(), - c_mat_file); - } - } -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/sparse/KokkosSparse_par_ilut.cpp b/perf_test/sparse/KokkosSparse_par_ilut.cpp new file mode 100644 index 0000000000..ef144f2817 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -0,0 +1,479 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include +#include +#include +#include +#include // std::setprecision + +#include + +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_spiluk.hpp" +#include "KokkosSparse_par_ilut.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosBlas1_nrm2.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_default_types.hpp" +#include +#include +#include "KokkosKernels_perf_test_utilities.hpp" + +#include "Benchmark_Context.hpp" +#include + +#ifdef USE_GINKGO +#include +#endif + +namespace { + +using KokkosSparse::Experimental::par_ilut_numeric; +using KokkosSparse::Experimental::par_ilut_symbolic; + +using KokkosSparse::Experimental::spiluk_numeric; +using KokkosSparse::Experimental::spiluk_symbolic; +using KokkosSparse::Experimental::SPILUKAlgorithm; + +// Build up useful types +using scalar_t = default_scalar; +using lno_t = default_lno_t; +using size_type = default_size_type; +using exe_space = Kokkos::DefaultExecutionSpace; +using mem_space = typename exe_space::memory_space; +using device = Kokkos::Device; + +using RowMapType = Kokkos::View; +using EntriesType = Kokkos::View; +using ValuesType = Kokkos::View; + +using sp_matrix_type = + KokkosSparse::CrsMatrix; +using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; +using float_t = typename Kokkos::ArithTraits::mag_type; + +static constexpr bool IS_GPU = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + +/////////////////////////////////////////////////////////////////////////////// +void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, + const sp_matrix_type& A, int& num_iters) +/////////////////////////////////////////////////////////////////////////////// +{ + const int rows = state.range(0); + + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Pull out views from CRS + auto A_row_map 
= A.graph.row_map; + auto A_entries = A.graph.entries; + auto A_values = A.values; + + // Allocate L and U CRS views as outputs + RowMapType L_row_map("L_row_map", rows + 1); + RowMapType U_row_map("U_row_map", rows + 1); + + // Initial L/U approximations for A + EntriesType L_entries("L_entries", 0); + ValuesType L_values("L_values", 0); + EntriesType U_entries("U_entries", 0); + ValuesType U_values("U_values", 0); + + for (auto _ : state) { + state.ResumeTiming(); + par_ilut_symbolic(&kh, A_row_map, A_entries, L_row_map, U_row_map); + + size_type nnzL = par_ilut_handle->get_nnzL(); + size_type nnzU = par_ilut_handle->get_nnzU(); + + Kokkos::resize(L_entries, nnzL); + Kokkos::resize(U_entries, nnzU); + Kokkos::resize(L_values, nnzL); + Kokkos::resize(U_values, nnzU); + Kokkos::deep_copy(L_entries, 0); + Kokkos::deep_copy(U_entries, 0); + Kokkos::deep_copy(L_values, 0); + Kokkos::deep_copy(U_values, 0); + + par_ilut_numeric(&kh, A_row_map, A_entries, A_values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values); + Kokkos::fence(); + state.PauseTiming(); + + // Check worked + num_iters = par_ilut_handle->get_num_iters(); + KK_REQUIRE_MSG(num_iters < par_ilut_handle->get_max_iter(), + "par_ilut hit max iters"); + + // Reset inputs + Kokkos::deep_copy(L_row_map, 0); + Kokkos::deep_copy(U_row_map, 0); + } +} + +#ifdef USE_GINKGO +/////////////////////////////////////////////////////////////////////////////// +using ginkgo_exec = + std::conditional_t; + +template +std::shared_ptr get_ginkgo_exec() { + return GinkgoT::create(); +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +std::shared_ptr get_ginkgo_exec() { + auto ref_exec = gko::ReferenceExecutor::create(); + return gko::CudaExecutor::create(0 /*device id*/, ref_exec); +} +#endif + +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +void run_par_ilut_test_ginkgo(benchmark::State& state, 
KernelHandle& kh, + const sp_matrix_type& A, const int& num_iters) +/////////////////////////////////////////////////////////////////////////////// +{ + const int rows = state.range(0); + + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Pull out views from CRS + auto A_row_map = A.graph.row_map; + auto A_entries = A.graph.entries; + auto A_values = A.values; + + using mtx = gko::matrix::Csr; + + auto exec = get_ginkgo_exec(); + + // ginkgo does not differentiate between index type and size type. We need + // to convert A_row_map to lno_t. + EntriesType A_row_map_cp("A_row_map_cp", rows + 1); + Kokkos::deep_copy(A_row_map_cp, A_row_map); + + // Populate mtx + auto a_mtx_uniq = + mtx::create_const(exec, gko::dim<2>(rows, rows), + gko::array::const_view( + exec, A_values.extent(0), A_values.data()), + gko::array::const_view(exec, A_entries.extent(0), + A_entries.data()), + gko::array::const_view( + exec, A_row_map_cp.extent(0), A_row_map_cp.data())); + + std::shared_ptr a_mtx = std::move(a_mtx_uniq); + + for (auto _ : state) { + auto fact = gko::factorization::ParIlut::build() + .with_fill_in_limit(par_ilut_handle->get_fill_in_limit()) + .with_approximate_select(false) + .with_iterations(num_iters) + .on(exec) + ->generate(a_mtx); + } +} +#endif + +/////////////////////////////////////////////////////////////////////////////// +void run_spiluk_test(benchmark::State& state, KernelHandle& kh, + const sp_matrix_type& A, const int& team_size, + const bool measure_symbolic) +/////////////////////////////////////////////////////////////////////////////// +{ + const int rows = state.range(0); + + constexpr int EXPAND_FACT = 10; + const lno_t fill_lev = 2; + const size_type handle_nnz = EXPAND_FACT * A.nnz() * (fill_lev + 1); + kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, rows, handle_nnz, + handle_nnz); + auto spiluk_handle = kh.get_spiluk_handle(); + spiluk_handle->set_team_size(team_size); + + // Pull out views from CRS + auto A_row_map = 
A.graph.row_map; + auto A_entries = A.graph.entries; + auto A_values = A.values; + + // Allocate L and U CRS views as outputs + RowMapType L_row_map("L_row_map", rows + 1); + RowMapType U_row_map("U_row_map", rows + 1); + + // Initial L/U approximations for A + EntriesType L_entries("L_entries", handle_nnz); + ValuesType L_values("L_values", handle_nnz); + EntriesType U_entries("U_entries", handle_nnz); + ValuesType U_values("U_values", handle_nnz); + + for (auto _ : state) { + if (measure_symbolic) { + state.ResumeTiming(); + } + spiluk_symbolic(&kh, fill_lev, A_row_map, A_entries, L_row_map, L_entries, + U_row_map, U_entries); + Kokkos::fence(); + state.PauseTiming(); + + const size_type nnzL = spiluk_handle->get_nnzL(); + const size_type nnzU = spiluk_handle->get_nnzU(); + + Kokkos::resize(L_entries, nnzL); + Kokkos::resize(U_entries, nnzU); + Kokkos::resize(L_values, nnzL); + Kokkos::resize(U_values, nnzU); + + if (!measure_symbolic) { + state.ResumeTiming(); + spiluk_numeric(&kh, fill_lev, A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, U_row_map, U_entries, U_values); + Kokkos::fence(); + state.PauseTiming(); + } + + // Reset inputs + Kokkos::deep_copy(L_row_map, 0); + Kokkos::deep_copy(U_row_map, 0); + Kokkos::deep_copy(L_entries, 0); + Kokkos::deep_copy(U_entries, 0); + Kokkos::deep_copy(L_values, 0); + Kokkos::deep_copy(U_values, 0); + Kokkos::resize(L_entries, handle_nnz); + Kokkos::resize(U_entries, handle_nnz); + + spiluk_handle->reset_handle(rows, handle_nnz, handle_nnz); + } +} + +/////////////////////////////////////////////////////////////////////////////// +int test_par_ilut_perf(const std::string& matrix_file, int rows, + int nnz_per_row, const int bandwidth, int team_size, + const int loop, const int test) +/////////////////////////////////////////////////////////////////////////////// +{ + KernelHandle kh; + kh.create_par_ilut_handle(); + + // Generate or read A + sp_matrix_type A; + if (matrix_file == "") { + size_type nnz = 
rows * nnz_per_row; + const lno_t row_size_variance = 0; + const scalar_t diag_dominance = 1; + A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + sp_matrix_type>(rows, rows, nnz, row_size_variance, bandwidth, + diag_dominance); + } else { + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + matrix_file.c_str()); + rows = A.numRows(); + nnz_per_row = A.nnz() / rows; + } + + // Now that we have A, we can set team_size + if (team_size == -1) { + team_size = KokkosKernels::Impl::kk_is_gpu_exec_space() + ? nnz_per_row + : 1; + } + + KokkosSparse::sort_crs_matrix(A); + + // Make handles + auto par_ilut_handle = kh.get_par_ilut_handle(); + par_ilut_handle->set_team_size(team_size); + par_ilut_handle->set_nrows(rows); + + const auto default_policy = par_ilut_handle->get_default_team_policy(); + + // Report test config to user + if (matrix_file == "") { + std::cout << "Testing par_ilut with rows=" << rows + << "\n nnz_per_row=" << nnz_per_row + << "\n bandwidth=" << bandwidth; + } else { + std::cout << "Testing par_ilut with input matrix=" << matrix_file; + } + std::cout << "\n total nnz=" << A.nnz() + << "\n league_size=" << default_policy.league_size() + << "\n team_size=" << default_policy.team_size() + << "\n concurrent teams=" + << exe_space().concurrency() / default_policy.team_size() + << "\n loop=" << loop << std::endl; + + std::string name = "KokkosSparse_par_ilut"; + int num_iters = 6; + const auto arg_names = std::vector{"rows"}; + const auto args = std::vector{rows}; + + if (test & 1) { + auto plambda = [&](benchmark::State& state) { + run_par_ilut_test(state, kh, A, num_iters); + }; + KokkosKernelsBenchmark::register_benchmark_real_time( + (name + "_par_ilut").c_str(), plambda, arg_names, args, loop); + } + +#ifdef USE_GINKGO + if (test & 2) { + auto glambda = [&](benchmark::State& state) { + run_par_ilut_test_ginkgo(state, kh, A, num_iters); + }; + KokkosKernelsBenchmark::register_benchmark_real_time( + (name + "_gingko").c_str(), 
glambda, arg_names, args, loop); + } +#endif + + if (test & 4) { + auto s1lambda = [&](benchmark::State& state) { + run_spiluk_test(state, kh, A, team_size, true); + }; + auto s2lambda = [&](benchmark::State& state) { + run_spiluk_test(state, kh, A, team_size, false); + }; + KokkosKernelsBenchmark::register_benchmark_real_time( + (name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop); + + KokkosKernelsBenchmark::register_benchmark_real_time( + (name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop); + } + + // Need to run before vars used by lambdas go out of scope + benchmark::RunSpecifiedBenchmarks(); + + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +void print_help_par_ilut() +/////////////////////////////////////////////////////////////////////////////// +{ + printf("Options:\n"); + printf(" -f [F] : Read in Matrix Market formatted text file.\n"); + printf(" -n [N] : generate a semi-random banded NxN matrix.\n"); + printf(" -z [Z] : number nnz per row. Default is min(1%% of N, 50).\n"); + printf(" -b [B] : bandwidth per row. Default is max(2 * n^(1/2), nnz).\n"); + printf( + " -ts [T] : Number of threads per team. Default is 1 on OpenMP, " + "nnz_per_row on CUDA\n"); + // printf(" -vl [V] : Vector-length (i.e. how many Cuda threads are a Kokkos + // 'thread').\n"); + printf( + " -l [L] : How many runs to aggregate average time. Default is 4\n\n"); + printf( + " -t [T] : Which tests to run. Bitwise. e.g. 7 => run all, 1 => " + "par_ilut, 2 => ginkgo, 4 => spiluk,. 
Default is 7\n\n"); +} + +/////////////////////////////////////////////////////////////////////////////// +void handle_int_arg(int argc, char** argv, int& i, + std::map option_map) +/////////////////////////////////////////////////////////////////////////////// +{ + std::string arg = argv[i]; + auto it = option_map.find(arg); + KK_USER_REQUIRE_MSG(it != option_map.end(), "Unknown option: " << arg); + KK_USER_REQUIRE_MSG(i + 1 < argc, "Missing option value for option: " << arg); + *(it->second) = atoi(argv[++i]); +} + +} // namespace + +/////////////////////////////////////////////////////////////////////////////// +int main(int argc, char** argv) +/////////////////////////////////////////////////////////////////////////////// +{ + std::string mfile = ""; + int rows = -1; + int nnz_per_row = + -1; // depends on other options, so don't set to default yet + int bandwidth = -1; + int team_size = -1; + int test = 7; + + std::map option_map = {{"-n", &rows}, + {"-z", &nnz_per_row}, + {"-b", &bandwidth}, + {"-ts", &team_size}, + {"-t", &test}}; + + if (argc == 1) { + print_help_par_ilut(); + return 0; + } + + // Handle common params + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + + // Handle user options + for (int i = 1; i < argc; i++) { + if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) { + print_help_par_ilut(); + return 0; + } else if ((strcmp(argv[i], "-f") == 0)) { + mfile = argv[++i]; + } else { + handle_int_arg(argc, argv, i, option_map); + } + } + + // Determine where A is coming from + if (rows != -1) { + // We are randomly generating the input A + KK_USER_REQUIRE_MSG(rows >= 100, "Need to have at least 100 rows"); + + KK_USER_REQUIRE_MSG( + mfile == "", + "Need provide either -n or -f argument to this program, not both"); + } else { + // We are reading A from a file + KK_USER_REQUIRE_MSG( + mfile != "", + "Need provide either -n or -f argument to this program, not both"); + } + + 
// Set dependent defaults. Default team_size cannot be set + // until we know more about A + if (nnz_per_row == -1) { + nnz_per_row = std::min(rows / 100, 50); + } + if (bandwidth == -1) { + bandwidth = std::max(2 * (int)std::sqrt(rows), 2 * nnz_per_row); + } + + Kokkos::initialize(argc, argv); + { + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + test_par_ilut_perf(mfile, rows, nnz_per_row, bandwidth, team_size, + common_params.repeat, test); + + benchmark::Shutdown(); + } + Kokkos::finalize(); + return 0; +} diff --git a/perf_test/sparse/KokkosSparse_run_spgemm.hpp b/perf_test/sparse/KokkosSparse_run_spgemm.hpp deleted file mode 100644 index 67d61d1f75..0000000000 --- a/perf_test/sparse/KokkosSparse_run_spgemm.hpp +++ /dev/null @@ -1,301 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosSparse_spgemm.hpp" -#include "KokkosKernels_TestParameters.hpp" -#include "KokkosSparse_SortCrs.hpp" - -#define TRANPOSEFIRST false -#define TRANPOSESECOND false - -namespace KokkosKernels { - -namespace Experiment { -template -bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - - size_t nrows1 = output_mat1.graph.row_map.extent(0); - size_t nentries1 = output_mat1.graph.entries.extent(0); - size_t nvals1 = output_mat1.values.extent(0); - - size_t nrows2 = output_mat2.graph.row_map.extent(0); - size_t nentries2 = output_mat2.graph.entries.extent(0); - size_t nvals2 = output_mat2.values.extent(0); - - KokkosSparse::sort_crs_matrix(output_mat1); - - if (nrows1 != nrows2) { - std::cerr << "row count is different" << std::endl; - return false; - } - if (nentries1 != nentries2) { - std::cerr << "nentries2 is different" << std::endl; - return false; - } - if (nvals1 != nvals2) { - std::cerr << "nvals1 is different" << std::endl; - return false; - } - - KokkosSparse::sort_crs_matrix(output_mat2); - - bool is_identical = true; - is_identical = KokkosKernels::Impl::kk_is_identical_view< - typename graph_t::row_map_type, typename graph_t::row_map_type, - typename lno_view_t::value_type, typename device::execution_space>( - output_mat1.graph.row_map, output_mat2.graph.row_map, 0); - if (!is_identical) { - std::cerr << "rowmaps differ" << std::endl; - return false; - } - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, - typename device::execution_space>(output_mat1.graph.entries, - output_mat2.graph.entries, 0); - if 
(!is_identical) { - for (size_t i = 0; i < nrows1; ++i) { - size_t rb = output_mat1.graph.row_map(i); - size_t re = output_mat1.graph.row_map(i + 1); - bool incorrect = false; - for (size_t j = rb; j < re; ++j) { - if (output_mat1.graph.entries(j) != output_mat2.graph.entries(j)) { - incorrect = true; - break; - } - } - if (incorrect) { - for (size_t j = rb; j < re; ++j) { - std::cerr << "row:" << i << " j:" << j - << " h_ent1(j):" << output_mat1.graph.entries(j) - << " h_ent2(j):" << output_mat2.graph.entries(j) - << " rb:" << rb << " re:" << re << std::endl; - } - } - } - std::cerr << "entries differ" << std::endl; - return false; - } - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - scalar_view_t, scalar_view_t, typename scalar_view_t::value_type, - typename device::execution_space>(output_mat1.values, output_mat2.values, - 0.000001); - if (!is_identical) { - std::cerr << "Incorret values" << std::endl; - } - return true; -} - -template -crsMat_t3 run_experiment(crsMat_t crsMat, crsMat_t2 crsMat2, - Parameters params) { - using namespace KokkosSparse; - using namespace KokkosSparse::Experimental; - using device_t = Kokkos::Device; - int algorithm = params.algorithm; - int repeat = params.repeat; - int chunk_size = params.chunk_size; - - int shmemsize = params.shmemsize; - int team_size = params.team_size; - int use_dynamic_scheduling = params.use_dynamic_scheduling; - int verbose = params.verbose; - int calculate_read_write_cost = params.calculate_read_write_cost; - // char spgemm_step = params.spgemm_step; - int vector_size = params.vector_size; - int check_output = params.check_output; - int mkl_keep_output = params.mkl_keep_output; - // spgemm_step++; - typedef typename crsMat_t3::values_type::non_const_type scalar_view_t; - typedef typename crsMat_t3::row_map_type::non_const_type lno_view_t; - typedef typename crsMat_t3::index_type::non_const_type lno_nnz_view_t; - typedef typename lno_nnz_view_t::value_type lno_t; - typedef typename 
lno_view_t::value_type size_type; - typedef typename scalar_view_t::value_type scalar_t; - - lno_view_t row_mapC; - lno_nnz_view_t entriesC; - scalar_view_t valuesC; - - typedef KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, ExecSpace, TempMemSpace, PersistentMemSpace> - KernelHandle; - - typedef typename lno_nnz_view_t::value_type idx; - typedef typename lno_view_t::value_type size_type; - - KernelHandle kh; - kh.set_team_work_size(chunk_size); - kh.set_shmem_size(shmemsize); - kh.set_suggested_team_size(team_size); - kh.set_suggested_vector_size(vector_size); - - if (use_dynamic_scheduling) { - kh.set_dynamic_scheduling(true); - } - if (verbose) { - kh.set_verbose(true); - } - - const idx m = crsMat.numRows(); - const idx n = crsMat2.numRows(); - const idx k = crsMat2.numCols(); - - if (verbose) std::cout << "m:" << m << " n:" << n << " k:" << k << std::endl; - if (n < crsMat.numCols()) { - std::cerr << "left.numCols():" << crsMat.numCols() - << " right.numRows():" << crsMat2.numRows() << std::endl; - exit(1); - } - - // The reference product (for verifying correctness) - // Don't allocate them if they won't be used, but they must be declared here. 
- lno_view_t row_mapC_ref; - lno_nnz_view_t entriesC_ref; - scalar_view_t valuesC_ref; - // Reference output has same type as actual output - crsMat_t3 Ccrsmat_ref; - - if (check_output) { - if (verbose) std::cout << "Running a reference algorithm" << std::endl; - row_mapC_ref = lno_view_t("non_const_lnow_row", m + 1); - KernelHandle sequential_kh; - sequential_kh.set_team_work_size(chunk_size); - sequential_kh.set_shmem_size(shmemsize); - sequential_kh.set_suggested_team_size(team_size); - sequential_kh.create_spgemm_handle(KokkosSparse::SPGEMM_SERIAL); - - if (use_dynamic_scheduling) { - sequential_kh.set_dynamic_scheduling(true); - } - - spgemm_symbolic(&sequential_kh, m, n, k, crsMat.graph.row_map, - crsMat.graph.entries, TRANPOSEFIRST, crsMat2.graph.row_map, - crsMat2.graph.entries, TRANPOSESECOND, row_mapC_ref); - - ExecSpace().fence(); - - size_type c_nnz_size = sequential_kh.get_spgemm_handle()->get_c_nnz(); - entriesC_ref = lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); - valuesC_ref = scalar_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); - - spgemm_numeric(&sequential_kh, m, n, k, crsMat.graph.row_map, - crsMat.graph.entries, crsMat.values, TRANPOSEFIRST, - - crsMat2.graph.row_map, crsMat2.graph.entries, crsMat2.values, - TRANPOSESECOND, row_mapC_ref, entriesC_ref, valuesC_ref); - ExecSpace().fence(); - - Ccrsmat_ref = crsMat_t3("CorrectC", m, k, valuesC_ref.extent(0), - valuesC_ref, row_mapC_ref, entriesC_ref); - } - - for (int i = 0; i < repeat; ++i) { - kh.create_spgemm_handle(KokkosSparse::SPGEMMAlgorithm(algorithm)); - - kh.get_spgemm_handle()->mkl_keep_output = mkl_keep_output; - kh.get_spgemm_handle()->set_mkl_sort_option(params.mkl_sort_option); - - // if mkl2 input needs to be converted to 1base. - kh.get_spgemm_handle()->mkl_convert_to_1base = true; - - // 250000 default. if cache-mode is used on KNL can increase to 1M. 
- kh.get_spgemm_handle()->MaxColDenseAcc = params.MaxColDenseAcc; - - if (i == 0) { - kh.get_spgemm_handle()->set_read_write_cost_calc( - calculate_read_write_cost); - } - // do the compression whether in 2 step, or 1 step. - kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); - // whether to scale the hash more. default is 1, so no scale. - kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); - // max occupancy in 1-level LP hashes. LL hashes can be 100% - kh.get_spgemm_handle()->set_first_level_hash_cut_off( - params.first_level_hash_cut_off); - // min reduction on FLOPs to run compression - kh.get_spgemm_handle()->set_compression_cut_off(params.compression_cut_off); - - row_mapC = lno_view_t("non_const_lnow_row", m + 1); - entriesC = lno_nnz_view_t("entriesC (empty)", 0); - valuesC = scalar_view_t("valuesC (empty)", 0); - - Kokkos::Timer timer1; - spgemm_symbolic(&kh, m, n, k, crsMat.graph.row_map, crsMat.graph.entries, - TRANPOSEFIRST, crsMat2.graph.row_map, crsMat2.graph.entries, - TRANPOSESECOND, row_mapC); - - ExecSpace().fence(); - double symbolic_time = timer1.seconds(); - - Kokkos::Timer timer3; - size_type c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); - if (verbose) std::cout << "C SIZE:" << c_nnz_size << std::endl; - if (c_nnz_size) { - entriesC = lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); - valuesC = scalar_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), - c_nnz_size); - } - spgemm_numeric(&kh, m, n, k, crsMat.graph.row_map, crsMat.graph.entries, - crsMat.values, TRANPOSEFIRST, - - crsMat2.graph.row_map, crsMat2.graph.entries, crsMat2.values, - TRANPOSESECOND, row_mapC, entriesC, valuesC); - ExecSpace().fence(); - double numeric_time = timer3.seconds(); - - std::cout << "mm_time:" << symbolic_time + numeric_time - << " symbolic_time:" << symbolic_time - << " numeric_time:" << numeric_time << std::endl; - } - if (verbose) { - std::cout << 
"row_mapC:" << row_mapC.extent(0) << std::endl; - std::cout << "entriesC:" << entriesC.extent(0) << std::endl; - std::cout << "valuesC:" << valuesC.extent(0) << std::endl; - KokkosKernels::Impl::print_1Dview(valuesC); - KokkosKernels::Impl::print_1Dview(entriesC); - KokkosKernels::Impl::print_1Dview(row_mapC); - } - crsMat_t3 Ccrsmat_result("CrsMatrixC", m, k, valuesC.extent(0), valuesC, - row_mapC, entriesC); - if (check_output) { - bool is_identical = - is_same_matrix(Ccrsmat_result, Ccrsmat_ref); - if (!is_identical) { - std::cerr << "Result differs. If values are differing, might be floating " - "point order error." - << std::endl; - exit(1); - } - } - return Ccrsmat_result; -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp index a2004e007b..db4141368a 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp @@ -299,9 +299,9 @@ void run_spgemm_jacobi(Parameters params) { void, size_type> slow_crstmat_t; - char *a_mat_file = params.a_mtx_bin_file; - char *b_mat_file = params.b_mtx_bin_file; - char *c_mat_file = params.c_mtx_bin_file; + const char *a_mat_file = params.a_mtx_bin_file.c_str(); + const char *b_mat_file = params.b_mtx_bin_file.c_str(); + const char *c_mat_file = params.c_mtx_bin_file.c_str(); slow_crstmat_t a_slow_crsmat, b_slow_crsmat, c_slow_crsmat; fast_crstmat_t a_fast_crsmat, b_fast_crsmat, c_fast_crsmat; diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 13a7c26d2e..e8a0b19419 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -20,8 +20,12 @@ #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" #include "KokkosSparse_Utils_mkl.hpp" -#include "KokkosSparse_spadd.hpp" #include "KokkosKernels_TestUtils.hpp" +#include 
"KokkosKernels_perf_test_utilities.hpp" + +#include "KokkosSparse_spadd.hpp" + +using perf_test::CommonInputParams; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include @@ -32,17 +36,10 @@ #include #endif -#if defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) - -struct Params { - int use_cuda = 0; - int use_openmp = 0; - int use_threads = 0; - int use_mkl = 0; - int use_cusparse = 0; - bool sorted = true; +struct LocalParams { + bool use_mkl = false; + bool use_cusparse = false; + bool sorted = true; std::string amtx; std::string bmtx; std::string cmtx; @@ -55,17 +52,113 @@ struct Params { int numericRepeat = 1; // how many times to call numeric per overall run }; -template -void run_experiment(const Params& params) { +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --amtx :: 1st input matrix" << std::endl; + std::cerr << "\t[Optional] --bmtx :: 2nd input matrix" << std::endl; + std::cerr << "\t[Optional] --cmtx :: output matrix for C = A+B" + << std::endl; + std::cerr << "\t[Optional] --mkl :: run SpAdd from MKL" << std::endl; + std::cerr << "\t[Optional] --cusparse :: run SpAdd from cuSPARSE " + << std::endl; + std::cerr << "\t[Optional] --sorted :: sort rows of inputs, and run the " + "sorted algorithm" + << std::endl; + std::cerr << "\t[Optional] --unsorted :: run the unsorted algorithm" + << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + "spadd (symbolic + repeated numeric)" + << std::endl; + std::cerr << "\t[Optional] --numeric-repeat :: how many times to repeat " + "numeric per symbolic" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\nSettings for randomly generated A/B matrices" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + 
std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; + std::cerr + << "\t[Optional] --nnz :: number of entries per row to generate" + << std::endl; + std::cerr << "\t[Optional] --bdiag :: generate B as a diagonal matrix" + << std::endl; +} + +int parse_inputs(LocalParams& params, int argc, char** argv) { + bool printHelp = false; + bool discard; + for (int i = 1; i < argc; ++i) { + // if (perf_test::check_arg_str(i, argc, argv, "--amtx", params.amtx)) { + // ++i; + if (perf_test::check_arg_bool(i, argc, argv, "--mkl", params.use_mkl)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--cusparse", + params.use_cusparse)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--sorted", + params.sorted)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--unsorted", + discard)) { + params.sorted = false; + } else if (perf_test::check_arg_str(i, argc, argv, "--amtx", params.amtx)) { + // A at C=AxB + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--bmtx", params.bmtx)) { + // B at C=AxB. + // if not provided, C = AxA will be performed. + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--cmtx", params.cmtx)) { + // if provided, C will be written to given file. + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--nnz", + params.nnzPerRow)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--bdiag", + params.bDiag)) { + } else if (perf_test::check_arg_int(i, argc, argv, "--repeat", + params.repeat)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--numeric-repeat", + params.numericRepeat)) { + // Reuse the symbolic step this many times. 
+ ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + if (printHelp) { + print_options(); + return 1; + } + return 0; +} + +template +void run_experiment(int argc, char** argv, CommonInputParams) { using namespace KokkosSparse; using namespace KokkosSparse::Experimental; - using size_type = typename crsMat_t::size_type; - using lno_t = typename crsMat_t::ordinal_type; - using scalar_t = typename crsMat_t::value_type; - using device_t = typename crsMat_t::device_type; - using exec_space = typename device_t::execution_space; - using mem_space = typename device_t::memory_space; + using mem_space = typename exec_space::memory_space; + using device_t = typename Kokkos::Device; + using size_type = default_size_type; + using lno_t = default_lno_t; + using scalar_t = default_scalar; + using crsMat_t = + KokkosSparse::CrsMatrix; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, exec_space, mem_space, mem_space>; @@ -75,7 +168,30 @@ void run_experiment(const Params& params) { using entries_t = typename graph_t::entries_type::non_const_type; using values_t = typename crsMat_t::values_type::non_const_type; - std::cout << "************************************* \n"; + LocalParams params; + if (parse_inputs(params, argc, argv)) return; + + // First, make sure that requested TPL (if any) is actually available +#if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + if (params.use_mkl) + throw std::invalid_argument( + "To run MKL SpAdd, must enable the MKL TPL in cmake"); +#endif +#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + if (params.use_cusparse) + throw std::invalid_argument( + "To run cuSPARSE SpAdd, 
must enable the cuSPARSE TPL in cmake"); +#else + if (params.use_cusparse && !std::is_same::value) + throw std::invalid_argument( + "To run cuSPARSE SpAdd, must select the Cuda backend"); +#endif + + if (params.cmtx.length() && params.use_mkl) { + throw std::invalid_argument( + "If running MKL, can't output the result to file"); + } + std::cout << "************************************* \n"; crsMat_t A; crsMat_t B; @@ -212,14 +328,19 @@ void run_experiment(const Params& params) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL sparse_matrix_t Amkl, Bmkl, Cmkl; if (params.use_mkl) { - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), - (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), - A.values.data())); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), - (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), - B.values.data())); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), + (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), + A.values.data())); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), + (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), + B.values.data())); + } else { + throw std::runtime_error( + "MKL configured with long long int not supported in Kokkos Kernels"); + } } #endif @@ -332,194 +453,8 @@ void run_experiment(const Params& params) { } } -void print_options() { - std::cerr << "Options\n" << std::endl; - - std::cerr - << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp " - "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" - << std::endl; - - std::cerr << "\t[Optional] --amtx :: 1st input matrix" << std::endl; - std::cerr << "\t[Optional] --bmtx :: 2nd input matrix" << 
std::endl; - std::cerr << "\t[Optional] --cmtx :: output matrix for C = A+B" - << std::endl; - std::cerr << "\t[Optional] --mkl :: run SpAdd from MKL" << std::endl; - std::cerr << "\t[Optional] --cusparse :: run SpAdd from cuSPARSE " - << std::endl; - std::cerr << "\t[Optional] --sorted :: sort rows of inputs, and run the " - "sorted algorithm" - << std::endl; - std::cerr << "\t[Optional] --unsorted :: run the unsorted algorithm" - << std::endl; - std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " - "spadd (symbolic + repeated numeric)" - << std::endl; - std::cerr << "\t[Optional] --numeric-repeat :: how many times to repeat " - "numeric per symbolic" - << std::endl; - std::cerr << "\t[Optional] --verbose :: enable verbose output" - << std::endl; - std::cerr << "\nSettings for randomly generated A/B matrices" << std::endl; - std::cerr << "\t[Optional] --m :: number of rows to generate" - << std::endl; - std::cerr << "\t[Optional] --n :: number of cols to generate" - << std::endl; - std::cerr - << "\t[Optional] --nnz :: number of entries per row to generate" - << std::endl; - std::cerr - << "\t[Optional] --nnz :: number of entries per row to generate" - << std::endl; - std::cerr << "\t[Optional] --bdiag :: generate B as a diagonal matrix" - << std::endl; -} - -int parse_inputs(Params& params, int argc, char** argv) { - for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - params.use_cuda = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--mkl")) { - params.use_mkl = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--cusparse")) { - params.use_cusparse = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--sorted")) { - params.sorted = 
true; - } else if (0 == Test::string_compare_no_case(argv[i], "--unsorted")) { - params.sorted = false; - } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { - // A at C=AxB - params.amtx = argv[++i]; - } else if (0 == Test::string_compare_no_case(argv[i], "--bmtx")) { - // B at C=AxB. - // if not provided, C = AxA will be performed. - params.bmtx = argv[++i]; - } else if (0 == Test::string_compare_no_case(argv[i], "--cmtx")) { - // if provided, C will be written to given file. - // has to have ".bin", or ".crs" extension. - params.cmtx = argv[++i]; - } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { - params.m = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { - params.n = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--nnz")) { - params.nnzPerRow = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--bdiag")) { - params.bDiag = true; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { - // if provided, C will be written to given file. - // has to have ".bin", or ".crs" extension. - params.repeat = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--numeric-repeat")) { - // Reuse the symbolic step this many times. 
- params.numericRepeat = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { - params.verbose = true; - } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; - print_options(); - return 1; - } - } - return 0; -} - +#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment +#include "KokkosKernels_perf_test_instantiation.hpp" int main(int argc, char** argv) { - Params params; - - if (parse_inputs(params, argc, argv)) { - return 1; - } - const int num_threads = - params.use_openmp; // Assumption is that use_openmp variable is provided - // as number of threads - const int device_id = params.use_cuda - 1; - - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - // Kokkos::print_configuration(std::cout); - - // First, make sure that requested TPL (if any) is actually available -#if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) - if (params.use_mkl) - throw std::invalid_argument( - "To run MKL SpAdd, must enable the MKL TPL in cmake"); -#endif -#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - if (params.use_cusparse) - throw std::invalid_argument( - "To run cuSPARSE SpAdd, must enable the cuSPARSE TPL in cmake"); -#endif - - bool useOMP = params.use_openmp != 0; - bool useCUDA = params.use_cuda != 0; - - if (params.use_cusparse && !useCUDA) { - throw std::invalid_argument( - "To run cuSPARSE SpAdd, must supply the '--cuda ' flag"); - } - - if (params.cmtx.length() && params.use_mkl) { - throw std::invalid_argument( - "If running MKL, can't output the result to file"); - } - - bool useSerial = !useOMP && !useCUDA; - - if (useOMP) { -#if defined(KOKKOS_ENABLE_OPENMP) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: OpenMP requested, but not available.\n"; - return 1; -#endif - } - if (useCUDA) { -#if defined(KOKKOS_ENABLE_CUDA) - using crsMat_t = - KokkosSparse::CrsMatrix; - 
run_experiment(params); -#else - std::cout << "ERROR: CUDA requested, but not available.\n"; - return 1; -#endif - } - if (useSerial) { -#if defined(KOKKOS_ENABLE_SERIAL) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: Serial device requested, but not available.\n"; - return 1; -#endif - } - Kokkos::finalize(); - return 0; -} - -#else -int main() { -#if !defined(KOKKOSKERNELS_INST_DOUBLE) - std::cout << " not defined KOKKOSKERNELS_INST_DOUBLE" << std::endl; -#endif - -#if !defined(KOKKOSKERNELS_INST_OFFSET_INT) - std::cout << " not defined KOKKOSKERNELS_INST_OFFSET_INT" << std::endl; - -#endif - -#if !defined(KOKKOSKERNELS_INST_ORDINAL_INT) - std::cout << " not defined KOKKOSKERNELS_INST_ORDINAL_INT" << std::endl; - -#endif -} -#endif + return main_instantiation(argc, argv); +} // main diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index d46e9f6f11..cee68ef11a 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -16,22 +16,119 @@ #include #include "KokkosKernels_config.h" #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosSparse_multimem_spgemm.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spgemm.hpp" +#include "KokkosSparse_SortCrs.hpp" +#include "KokkosBlas1_nrminf.hpp" +#include "KokkosBlas1_axpby.hpp" +#include "KokkosKernels_TestParameters.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#define TRANSPOSEFIRST false +#define TRANSPOSESECOND false + +template +bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + typedef typename 
crsMat_t::values_type::non_const_type scalar_view_t; + + size_t nrows1 = output_mat_actual.numRows(); + size_t ncols1 = output_mat_actual.numCols(); + size_t nentries1 = output_mat_actual.graph.entries.extent(0); + size_t nvals1 = output_mat_actual.values.extent(0); + + size_t nrows2 = output_mat_reference.numRows(); + size_t ncols2 = output_mat_reference.numCols(); + size_t nentries2 = output_mat_reference.graph.entries.extent(0); + size_t nvals2 = output_mat_reference.values.extent(0); + + if (nrows1 != nrows2 || ncols1 != ncols2) { + std::cerr << "Wrong dimensions: is " << nrows1 << 'x' << ncols1 + << " but should be " << nrows2 << 'x' << ncols2 << '\n'; + return false; + } + if (nentries1 != nentries2) { + std::cerr << "Wrong number of entries: " << nentries1 + << ", but should have " << nentries2 << '\n'; + return false; + } + if (nvals1 != nvals2) { + std::cerr << "Wrong number of values: " << nvals1 << ", but should have " + << nvals2 << '\n'; + return false; + } + + bool is_identical = true; + is_identical = KokkosKernels::Impl::kk_is_identical_view< + typename graph_t::row_map_type, typename graph_t::row_map_type, + typename lno_view_t::value_type, typename device::execution_space>( + output_mat_actual.graph.row_map, output_mat_reference.graph.row_map, 0); + if (!is_identical) { + std::cerr << "Wrong rowmap:\n"; + KokkosKernels::Impl::print_1Dview(std::cerr, + output_mat_actual.graph.row_map); + std::cerr << "but should be:\n"; + KokkosKernels::Impl::print_1Dview(std::cerr, + output_mat_reference.graph.row_map); + return false; + } + + is_identical = KokkosKernels::Impl::kk_is_identical_view< + lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, + typename device::execution_space>(output_mat_actual.graph.entries, + output_mat_reference.graph.entries, 0); + if (!is_identical) { + for (size_t i = 0; i < nrows1; ++i) { + size_t rb = output_mat_actual.graph.row_map(i); + size_t re = 
output_mat_actual.graph.row_map(i + 1); + bool incorrect = false; + for (size_t j = rb; j < re; ++j) { + if (output_mat_actual.graph.entries(j) != + output_mat_reference.graph.entries(j)) { + incorrect = true; + break; + } + } + if (incorrect) { + for (size_t j = rb; j < re; ++j) { + std::cerr << "row:" << i << " j:" << j + << " h_ent1(j):" << output_mat_actual.graph.entries(j) + << " h_ent2(j):" << output_mat_reference.graph.entries(j) + << " rb:" << rb << " re:" << re << std::endl; + } + } + } + std::cerr << "Wrong entries, see above." << std::endl; + return false; + } + + scalar_view_t valueDiff( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "spgemm values diff"), + output_mat_actual.values.extent(0)); + Kokkos::deep_copy(valueDiff, output_mat_actual.values); + KokkosBlas::axpy(-1.0, output_mat_reference.values, valueDiff); + auto maxDiff = KokkosBlas::nrminf(valueDiff); + + std::cout + << "Absolute maximum difference between actual and reference C values: " + << maxDiff << '\n'; + + return true; +} void print_options() { std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + std::cerr << "\t[Required] INPUT MATRIX: '--amtx [left_hand_side.mtx]' -- for C=AxA" << std::endl; - std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp " - "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip " - "[hipDeviceIndex]' --> if none are specified, Serial is used " - "(if enabled)" - << std::endl; std::cerr << "\t[Optional] '--algorithm " "[DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE]' --> to choose algorithm. " @@ -47,158 +144,113 @@ void print_options() { "250k, which is max k value to choose dense accumulators. This " "can be increased with more memory bandwidth." << std::endl; - std::cerr - << "\tThe memory space used for each matrix: '--memspaces [0|1|....15]' " - "--> Bits representing the use of HBM for Work, C, B, and A " - "respectively. For example 12 = 1100, will store work arrays and C on " - "HBM. 
A and B will be stored DDR. To use this enable multilevel " - "memory in Kokkos, check generate_makefile.sh" - << std::endl; - std::cerr << "\tLoop scheduling: '--dynamic': Use this for dynamic " - "scheduling of the loops. (Better performance most of the time)" + std::cerr << "\t[Optional] '--dynamic': Use this for dynamic " + "loop scheduling. (Better performance most of the time)" + << std::endl; + std::cerr << "\t[Optional] '--verbose': detailed output about SpGEMM and the " + "output matrix" + << std::endl; + std::cerr << "\t[Optional] '--checkoutput': verify result against serial " + "reference implementation" << std::endl; - std::cerr << "\tVerbose Output: '--verbose'" << std::endl; -} - -static char* getNextArg(int& i, int argc, char** argv) { - i++; - if (i >= argc) { - std::cerr << "Error: expected additional command-line argument!\n"; - exit(1); - } - return argv[i]; } int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, char** argv) { + std::string algoStr; + bool printHelp = false; for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - params.use_cuda = atoi(getNextArg(i, argc, argv)) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = atoi(getNextArg(i, argc, argv)) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { - params.repeat = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--hashscale")) { - params.minhashscale = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--chunksize")) { - params.chunk_size = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], 
"--teamsize")) { - params.team_size = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--vectorsize")) { - params.vector_size = atoi(getNextArg(i, argc, argv)); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--compression2step")) { - params.compression2step = true; - } else if (0 == Test::string_compare_no_case(argv[i], "--shmem")) { - params.shmemsize = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--memspaces")) { - int memspaces = atoi(getNextArg(i, argc, argv)); - int memspaceinfo = memspaces; - std::cout << "memspaceinfo:" << memspaceinfo << std::endl; - if (memspaceinfo & 1) { - params.a_mem_space = 1; - std::cout << "Using HBM for A" << std::endl; - } else { - params.a_mem_space = 0; - std::cout << "Using DDR4 for A" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.b_mem_space = 1; - std::cout << "Using HBM for B" << std::endl; - } else { - params.b_mem_space = 0; - std::cout << "Using DDR4 for B" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.c_mem_space = 1; - std::cout << "Using HBM for C" << std::endl; - } else { - params.c_mem_space = 0; - std::cout << "Using DDR4 for C" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.work_mem_space = 1; - std::cout << "Using HBM for work memory space" << std::endl; - } else { - params.work_mem_space = 0; - std::cout << "Using DDR4 for work memory space" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--CRWC")) { - params.calculate_read_write_cost = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--CIF")) { - params.coloring_input_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--COF")) { - params.coloring_output_file = getNextArg(i, argc, argv); - } else if (0 == 
Test::string_compare_no_case(argv[i], "--CCO")) { + if (perf_test::check_arg_int(i, argc, argv, "--repeat", params.repeat)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--hashscale", + params.minhashscale)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--chunksize", + params.chunk_size)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--teamsize", + params.team_size)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--vectorsize", + params.vector_size)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--compression2step", + params.compression2step)) { + } else if (perf_test::check_arg_int(i, argc, argv, "--shmem", + params.shmemsize)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--CRWC", + params.calculate_read_write_cost)) { + } else if (perf_test::check_arg_str(i, argc, argv, "--CIF", + params.coloring_input_file)) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--COF", + params.coloring_output_file)) { + ++i; + } else if (perf_test::check_arg_double(i, argc, argv, "--CCO", + params.compression_cut_off)) { // if 0.85 set, if compression does not reduce flops by at least 15% // symbolic will run on original matrix. otherwise, it will compress the // graph and run symbolic on compressed one. - params.compression_cut_off = atof(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--FLHCO")) { + ++i; + } else if (perf_test::check_arg_double(i, argc, argv, "--FLHCO", + params.first_level_hash_cut_off)) { // if linear probing is used as hash, what is the max occupancy percantage // we allow in the hash. - params.first_level_hash_cut_off = atof(getNextArg(i, argc, argv)); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--flop")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--flop", + params.calculate_read_write_cost)) { // print flop statistics. only for the first repeat. 
- params.calculate_read_write_cost = 1; - } - - else if (0 == Test::string_compare_no_case(argv[i], "--mklsort")) { + // note: if either --CRWC or --flop is passed, this parameter is set to + // true + } else if (perf_test::check_arg_int(i, argc, argv, "--mklsort", + params.mkl_sort_option)) { // when mkl2 is run, the sort option to use. // 7:not to sort the output // 8:to sort the output - params.mkl_sort_option = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--mklkeepout")) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--mklkeepout", + params.mkl_keep_output)) { // mkl output is not kept. - params.mkl_keep_output = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--checkoutput")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--checkoutput", + params.check_output)) { // check correctness - params.check_output = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { + } else if (perf_test::check_arg_str(i, argc, argv, "--amtx", + params.a_mtx_bin_file)) { // A at C=AxB - params.a_mtx_bin_file = getNextArg(i, argc, argv); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--bmtx")) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--bmtx", + params.b_mtx_bin_file)) { // B at C=AxB. // if not provided, C = AxA will be performed. - params.b_mtx_bin_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--cmtx")) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--cmtx", + params.c_mtx_bin_file)) { // if provided, C will be written to given file. // has to have ".bin", or ".crs" extension. 
- params.c_mtx_bin_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--dynamic")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--dynamic", + params.use_dynamic_scheduling)) { // dynamic scheduling will be used for loops. // currently it is default already. // so has to use the dynamic schedulin. - params.use_dynamic_scheduling = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--DENSEACCMAX")) { + } else if (perf_test::check_arg_int(i, argc, argv, "--DENSEACCMAX", + params.MaxColDenseAcc)) { // on CPUs and KNLs if DEFAULT algorithm or KKSPGEMM is chosen, // it uses dense accumulators for smaller matrices based on the size of // column (k) in B. Max column size is 250,000 for k to use dense // accumulators. this parameter overwrites this. with cache mode, or CPUs // with smaller thread count, where memory bandwidth is not an issue, this // cut-off can be increased to be more than 250,000 - params.MaxColDenseAcc = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { // print the timing and information about the inner steps. // if you are timing TPL libraries, for correct timing use verbose option, // because there are pre- post processing in these TPL kernel wraps. 
- params.verbose = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--algorithm")) { - char* algoStr = getNextArg(i, argc, argv); - + } else if (perf_test::check_arg_str(i, argc, argv, "--algorithm", + algoStr)) { if (0 == Test::string_compare_no_case(algoStr, "DEFAULT")) { params.algorithm = KokkosSparse::SPGEMM_KK; } else if (0 == Test::string_compare_no_case(algoStr, "KKDEFAULT")) { @@ -218,11 +270,14 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "Unrecognized value for --algorithm (argument #" + << (i + 1) << "): " << algoStr << std::endl; print_options(); return 1; } + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { } else { std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; @@ -230,96 +285,239 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, return 1; } } + if (printHelp) { + print_options(); + return 1; + } return 0; } -int main(int argc, char** argv) { +template +void run_spgemm(int argc, char** argv, perf_test::CommonInputParams) { + using namespace KokkosSparse; + using namespace KokkosSparse::Experimental; + + using MemSpace = typename ExecSpace::memory_space; using size_type = default_size_type; using lno_t = default_lno_t; using scalar_t = default_scalar; + using device_t = Kokkos::Device; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, ExecSpace, MemSpace, MemSpace>; KokkosKernels::Experiment::Parameters params; if (parse_inputs(params, argc, argv)) { - return 1; + return; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a and b matrix files" << std::endl; 
print_options(); - return 0; + return; + } + + crsMat_t A, B, C; + + // read a and b matrices + + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.a_mtx_bin_file.c_str()); + + if ((params.b_mtx_bin_file == "" || + params.a_mtx_bin_file == params.b_mtx_bin_file)) { + std::cout << "B is not provided or is the same as A. Multiplying AxA." + << std::endl; + B = A; + } else { + B = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.b_mtx_bin_file.c_str()); + } + + int algorithm = params.algorithm; + int repeat = params.repeat; + int chunk_size = params.chunk_size; + + int shmemsize = params.shmemsize; + int team_size = params.team_size; + int use_dynamic_scheduling = params.use_dynamic_scheduling; + int verbose = params.verbose; + int calculate_read_write_cost = params.calculate_read_write_cost; + // char spgemm_step = params.spgemm_step; + int vector_size = params.vector_size; + int check_output = params.check_output; + int mkl_keep_output = params.mkl_keep_output; + // spgemm_step++; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef typename crsMat_t::row_map_type::non_const_type lno_view_t; + typedef typename crsMat_t::index_type::non_const_type lno_nnz_view_t; + + lno_view_t row_mapC; + lno_nnz_view_t entriesC; + scalar_view_t valuesC; + + KernelHandle kh; + kh.set_team_work_size(chunk_size); + kh.set_shmem_size(shmemsize); + kh.set_suggested_team_size(team_size); + kh.set_suggested_vector_size(vector_size); + + if (use_dynamic_scheduling) { + kh.set_dynamic_scheduling(true); } - if (params.b_mtx_bin_file == NULL) { - std::cout << "B is not provided. Multiplying AxA." << std::endl; + if (verbose) { + kh.set_verbose(true); } - const int num_threads = std::max(params.use_openmp, params.use_threads); - const int device_id = - params.use_cuda ? 
params.use_cuda - 1 : params.use_hip - 1; - - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - Kokkos::print_configuration(std::cout); - -#if defined(KOKKOS_ENABLE_OPENMP) - - if (params.use_openmp) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_HBWSPACE - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::OpenMP, - Kokkos::Experimental::HBWSpace, Kokkos::HostSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::OpenMP, - Kokkos::OpenMP::memory_space, Kokkos::OpenMP::memory_space>(params); -#endif + const lno_t m = A.numRows(); + const lno_t n = B.numRows(); + const lno_t k = B.numCols(); + + if (verbose) std::cout << "m:" << m << " n:" << n << " k:" << k << std::endl; + if (n < A.numCols()) { + std::cerr << "left.numCols():" << A.numCols() + << " right.numRows():" << B.numRows() << std::endl; + exit(1); } -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if (params.use_cuda) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_CUDAHOSTPINNEDSPACE - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::CudaHostPinnedSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::Cuda::memory_space>(params); - -#endif + + // The reference product (for verifying correctness) + // Don't allocate them if they won't be used, but they must be declared here. 
+ lno_view_t row_mapC_ref; + lno_nnz_view_t entriesC_ref; + scalar_view_t valuesC_ref; + // Reference output has same type as actual output + crsMat_t C_ref; + + if (check_output) { + if (verbose) std::cout << "Running a reference algorithm" << std::endl; + row_mapC_ref = lno_view_t("non_const_lnow_row", m + 1); + KernelHandle sequential_kh; + sequential_kh.set_team_work_size(chunk_size); + sequential_kh.set_shmem_size(shmemsize); + sequential_kh.set_suggested_team_size(team_size); + sequential_kh.create_spgemm_handle(KokkosSparse::SPGEMM_SERIAL); + + if (use_dynamic_scheduling) { + sequential_kh.set_dynamic_scheduling(true); + } + + spgemm_symbolic(&sequential_kh, m, n, k, A.graph.row_map, A.graph.entries, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, + TRANSPOSESECOND, row_mapC_ref); + + ExecSpace().fence(); + + size_type c_nnz_size = sequential_kh.get_spgemm_handle()->get_c_nnz(); + entriesC_ref = lno_nnz_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); + valuesC_ref = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); + + spgemm_numeric(&sequential_kh, m, n, k, A.graph.row_map, A.graph.entries, + A.values, TRANSPOSEFIRST, + + B.graph.row_map, B.graph.entries, B.values, TRANSPOSESECOND, + row_mapC_ref, entriesC_ref, valuesC_ref); + ExecSpace().fence(); + + C_ref = crsMat_t("CorrectC", m, k, valuesC_ref.extent(0), valuesC_ref, + row_mapC_ref, entriesC_ref); } -#endif -#if defined(KOKKOS_ENABLE_HIP) - if (params.use_hip) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); + for (int i = 0; i < repeat; ++i) { + kh.create_spgemm_handle(KokkosSparse::SPGEMMAlgorithm(algorithm)); + + kh.get_spgemm_handle()->mkl_keep_output = mkl_keep_output; + kh.get_spgemm_handle()->set_mkl_sort_option(params.mkl_sort_option); + + // if mkl2 input needs to be converted to 
1base. + kh.get_spgemm_handle()->mkl_convert_to_1base = true; + + // 250000 default. if cache-mode is used on KNL can increase to 1M. + kh.get_spgemm_handle()->MaxColDenseAcc = params.MaxColDenseAcc; + + if (i == 0) { + kh.get_spgemm_handle()->set_read_write_cost_calc( + calculate_read_write_cost); + } + // do the compression whether in 2 step, or 1 step. + kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); + // whether to scale the hash more. default is 1, so no scale. + kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); + // max occupancy in 1-level LP hashes. LL hashes can be 100% + kh.get_spgemm_handle()->set_first_level_hash_cut_off( + params.first_level_hash_cut_off); + // min reduction on FLOPs to run compression + kh.get_spgemm_handle()->set_compression_cut_off(params.compression_cut_off); + + row_mapC = lno_view_t("non_const_lnow_row", m + 1); + entriesC = lno_nnz_view_t("entriesC (empty)", 0); + valuesC = scalar_view_t("valuesC (empty)", 0); + + Kokkos::Timer timer1; + spgemm_symbolic(&kh, m, n, k, A.graph.row_map, A.graph.entries, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, + TRANSPOSESECOND, row_mapC); + + ExecSpace().fence(); + double symbolic_time = timer1.seconds(); + + Kokkos::Timer timer3; + size_type c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); + if (verbose) std::cout << "C SIZE:" << c_nnz_size << std::endl; + if (c_nnz_size) { + entriesC = lno_nnz_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); + valuesC = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), + c_nnz_size); + } + spgemm_numeric(&kh, m, n, k, A.graph.row_map, A.graph.entries, A.values, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, B.values, + TRANSPOSESECOND, row_mapC, entriesC, valuesC); + + ExecSpace().fence(); + double numeric_time = timer3.seconds(); + + std::cout << "mm_time:" << symbolic_time + numeric_time + << " symbolic_time:" << symbolic_time + << " 
numeric_time:" << numeric_time << std::endl; } -#endif - -#if defined(KOKKOS_ENABLE_THREADS) - // If only serial is enabled (or no other device was specified), run with - // serial - if (params.use_threads) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Threads, Kokkos::HostSpace, - Kokkos::HostSpace>(params); + if (verbose) { + std::cout << "row_mapC:" << row_mapC.extent(0) << std::endl; + std::cout << "entriesC:" << entriesC.extent(0) << std::endl; + std::cout << "valuesC:" << valuesC.extent(0) << std::endl; + KokkosKernels::Impl::print_1Dview(valuesC); + KokkosKernels::Impl::print_1Dview(entriesC); + KokkosKernels::Impl::print_1Dview(row_mapC); } -#endif - -#if defined(KOKKOS_ENABLE_SERIAL) - // If only serial is enabled (or no other device was specified), run with - // serial - if (!params.use_openmp && !params.use_cuda && !params.use_threads) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Serial, Kokkos::HostSpace, - Kokkos::HostSpace>(params); + crsMat_t C_result("CrsMatrixC", m, k, valuesC.extent(0), valuesC, row_mapC, + entriesC); + if (check_output) { + bool is_identical = is_same_matrix(C_result, C_ref); + if (!is_identical) { + std::cerr << "SpGEMM result differs with reference implementation.\n"; + exit(1); + } else { + std::cerr << "SpGEMM result matches reference implementation.\n"; + } } -#endif - Kokkos::finalize(); + if (params.c_mtx_bin_file != "") { + KokkosSparse::sort_crs_matrix(C_result); - return 0; + KokkosSparse::Impl::write_graph_bin( + (lno_t)(C_result.numRows()), (size_type)(C_result.nnz()), + C_result.graph.row_map.data(), C_result.graph.entries.data(), + C_result.values.data(), params.c_mtx_bin_file.c_str()); + } } + +#define KOKKOSKERNELS_PERF_TEST_NAME run_spgemm +#include "KokkosKernels_perf_test_instantiation.hpp" +int main(int argc, char** argv) { + return main_instantiation(argc, argv); +} // main diff --git 
a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index bcb71e951a..0f705e1209 100644 --- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -219,12 +219,12 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a and b matrix files" << std::endl; print_options(); return 0; } - if (params.b_mtx_bin_file == NULL) { + if (params.b_mtx_bin_file == "") { std::cout << "B is not provided. Multiplying AxA." << std::endl; } @@ -253,15 +253,9 @@ int main(int argc, char** argv) { #if defined(KOKKOS_ENABLE_CUDA) if (params.use_cuda) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_CUDAHOSTPINNEDSPACE - KokkosKernels::Experiment::run_spgemm_jacobi< - size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::CudaHostPinnedSpace>(params); -#else KokkosKernels::Experiment::run_spgemm_jacobi< size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, Kokkos::Cuda::memory_space>(params); -#endif } #endif diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp new file mode 100644 index 0000000000..aeaa37db96 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -0,0 +1,171 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +// Headers needed to create initial data +// and to check results at the end +#include +#include +#include "KokkosKernels_default_types.hpp" +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +// Headers for benchmark library +#include +#include "Benchmark_Context.hpp" + +// Headers for spmv +#include +#include + +namespace { + +struct spmv_parameters { + int N, offset; + std::string filename; + std::string alg; + std::string tpl; + + spmv_parameters(const int N_) + : N(N_), offset(0), filename(""), alg(""), tpl("") {} +}; + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr + << "\t[Optional] --repeat :: how many times to repeat overall test" + << std::endl; + std::cerr << " -n [N] :: generate a semi-random banded (band size " + "0.01xN)\n" + "NxN matrix with average of 10 entries per row." + << std::endl; + std::cerr << "\t[Optional] --alg :: the algorithm to run (default, " + "native, merge)" + << std::endl; + std::cerr + << "\t[Optional] --alg :: the algorithm to run (classic, merge)" + << std::endl; + std::cerr << "\t[Optional] --TPL :: when available and compatible with " + "alg, a TPL can be used (cusparse, rocsparse, MKL)" + << std::endl; + std::cerr + << " -f [file] : Read in Matrix Market formatted text file 'file'." + << std::endl; + std::cerr << " --offset [O] : Subtract O from every index.\n" + << " Useful in case the matrix market file is " + "not 0 based." 
+ << std::endl; +} // print_options + +void parse_inputs(int argc, char** argv, spmv_parameters& params) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--alg", params.alg)) { + if ((params.alg != "") && (params.alg != "default") && + (params.alg != "native") && (params.alg != "merge")) { + throw std::runtime_error( + "--alg can only be an empty string, `default`, `native` or " + "`merge`!"); + } + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--TPL", params.tpl)) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "-f", params.filename)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--offset", + params.offset)) { + ++i; + } else { + print_options(); + KK_USER_REQUIRE_MSG(false, "Unrecognized command line argument #" + << i << ": " << argv[i]); + } + } +} // parse_inputs + +template +void run_spmv(benchmark::State& state, const spmv_parameters& inputs) { + using matrix_type = + KokkosSparse::CrsMatrix; + using mv_type = Kokkos::View; + + KokkosKernels::Experimental::Controls controls; + if ((inputs.alg == "default") || (inputs.alg == "native") || + (inputs.alg == "merge")) { + controls.setParameter("algorithm", inputs.alg); + } + + // Create test matrix + srand(17312837); + matrix_type A; + if (inputs.filename == "") { + int nnz = 10 * inputs.N; + A = KokkosSparse::Impl::kk_generate_sparse_matrix( + inputs.N, inputs.N, nnz, 0, 0.01 * inputs.N); + } else { + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + inputs.filename.c_str()); + } + + // Create input vectors + mv_type x("X", A.numRows()); + mv_type y("Y", A.numCols()); + + Kokkos::Random_XorShift64_Pool rand_pool(13718); + Kokkos::fill_random(x, rand_pool, 10); + Kokkos::fill_random(y, rand_pool, 10); + + // Run the actual experiments + for (auto _ : state) { + KokkosSparse::spmv(controls, KokkosSparse::NoTranspose, 1.0, A, x, 0.0, y); + Kokkos::fence(); + 
} +} + +} // namespace + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMillisecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + + std::string bench_name = "KokkosSparse_spmv"; + + // Set input parameters, default to random 100000x100000 + spmv_parameters inputs(100000); + parse_inputs(argc, argv, inputs); + + // Google benchmark will report the wrong n if an input file matrix is used. + KokkosKernelsBenchmark::register_benchmark_real_time( + bench_name.c_str(), run_spmv, {"n"}, + {inputs.N}, common_params.repeat, inputs); + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + + return 0; +} diff --git a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp index c4deec656f..65120a8827 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp @@ -66,7 +66,7 @@ bool check_errors(mag_t tol, crsmat_t &Mtx, scalar_view_t rhs, using lno_t = typename entries_view_t::non_const_value_type; using values_view_t = typename crsmat_t::values_type::non_const_type; using scalar_t = typename values_view_t::value_type; - using STS = Kokkos::Details::ArithTraits; + using STS = Kokkos::ArithTraits; using execution_space = typename scalar_view_t::execution_space; diff --git a/perf_test/sparse/KokkosSparse_sptrsv_cholmod.cpp b/perf_test/sparse/KokkosSparse_sptrsv_cholmod.cpp index 87afbba79a..5de4e6be00 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_cholmod.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_cholmod.cpp @@ -153,7 +153,7 @@ template int test_sptrsv_perf(std::vector tests, std::string &filename, bool u_in_csr, bool invert_diag, bool invert_offdiag, int block_size, int loop) { - using STS = Kokkos::Details::ArithTraits; + 
using STS = Kokkos::ArithTraits; using mag_type = typename STS::mag_type; // using cholmod_int_type = long; diff --git a/perf_test/sparse/KokkosSparse_sptrsv_superlu.cpp b/perf_test/sparse/KokkosSparse_sptrsv_superlu.cpp index cc9d698554..659874a32c 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_superlu.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_superlu.cpp @@ -308,7 +308,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, int relax_size, int block_size, int loop) { using ordinal_type = int; using size_type = int; - using STS = Kokkos::Details::ArithTraits; + using STS = Kokkos::ArithTraits; using mag_type = typename STS::mag_type; // Default spaces diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp index 7ddd6957a9..7301d5e741 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp @@ -52,7 +52,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, bool invert_offdiag, bool u_in_csr, int loop) { using ordinal_type = int; using size_type = int; - using STS = Kokkos::Details::ArithTraits; + using STS = Kokkos::ArithTraits; using mag_type = typename STS::mag_type; // Default spaces diff --git a/perf_test/sparse/spmv/Kokkos_SPMV.hpp b/perf_test/sparse/spmv/Kokkos_SPMV.hpp index a79e49b764..6668511c4a 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV.hpp @@ -25,7 +25,7 @@ struct SPMV_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const value_type alpha; AMatrix m_A; diff --git a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp index 14ab6f8ebe..4e099e6f96 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp +++ 
b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp @@ -28,7 +28,7 @@ struct SPMV_Inspector_Functor { typedef typename AMatrix::non_const_size_type size_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const value_type alpha; AMatrix m_A; diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 117da595a7..98c5db89df 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -77,6 +77,8 @@ print_help() { echo "" echo "--no-default-eti: Do not include default ETI types for Kokkos Kernels" echo "" + echo "--disable-test-eti-only: Do not restrict testing to ETI types for Kokkos Kernels" + echo "" echo "--with-spaces=SPACES: Set spaces to be instantiated." echo " Options: hostspace, cudaspace, cudauvmspace" echo "" @@ -153,6 +155,10 @@ if [[ "$HOSTNAME" == *blake* ]]; then # Warning: very generic name module load git fi +if [[ "$HOSTNAME" == *solo* ]]; then # Warning: very generic name + MACHINE=solo +fi + if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then MACHINE=kokkos-dev-2 fi @@ -170,6 +176,14 @@ if [[ "$HOSTNAME" == caraway* ]]; then # Warning: very generic name MACHINE=caraway fi +if [[ "$HOSTNAME" == fat* ]]; then # Caraway MI250 queues + MACHINE=caraway +fi + +if [[ "$HOSTNAME" == lean* ]]; then # Caraway MI210 queues + MACHINE=caraway +fi + if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then MACHINE=kokkos-dev fi @@ -224,6 +238,7 @@ SKIP_HWLOC=False SPOT_CHECK=False NO_DEFAULT_ETI=False ENABLE_PERFTESTS=True +ENABLE_TEST_ETI_ONLY=True PRINT_HELP=False OPT_FLAG="" @@ -388,6 +403,9 @@ do --no-default-eti*) NO_DEFAULT_ETI=True ;; + --disable-test-eti-only*) + ENABLE_TEST_ETI_ONLY=False + ;; --disable-perftests*) ENABLE_PERFTESTS=False ;; @@ -577,63 +595,37 @@ elif [ "$MACHINE" = "inouye" ]; then SPACK_HOST_ARCH="+a64fx" elif [ "$MACHINE" = "weaver" ]; then - MODULE_ENVIRONMENT="source 
/etc/profile.d/modules.sh" + # Use the legacy env for now until all modules are part of the new system + MODULE_ENVIRONMENT="source /projects/ppc64le-pwr9-rhel8/legacy-env.sh" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True -# used with rhel7W queue - BASE_MODULE_LIST="cmake/3.23.1,/" - CUDA_MODULE_LIST="cmake/3.23.1,/,ibm/xl/16.1.1,gcc/7.2.0" - CUDA10_MODULE_LIST="cmake/3.23.1,/,ibm/xl/16.1.1,gcc/7.4.0" - - GCC72_MODULE_TPL_LIST="cmake/3.23.1,/,openblas/0.2.20/gcc/7.2.0" - GCC74_MODULE_TPL_LIST="cmake/3.23.1,/,openblas/0.2.20/gcc/7.2.0,gcc/7.4.0" GCC93_MODULE_TPL_LIST="cmake/3.23.1,/,openblas/0.3.20/gcc/9.3.0,gcc/9.3.0" - CUDA_MODULE_TPL_LIST="cmake/3.23.1,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.23.1,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" - - # Issues finding CUBLAS with cuda/10.1.243 module at configure - # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)" - # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS -# "cuda/10.1.243 $CUDA10_MODULE_TPL_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" CLANG13_MODULE_TPL_LIST="cmake/3.23.1,/,openblas/0.3.20/gcc/9.3.0,cuda/10.1.243" -# used with rhel8 queue - RHEL8_BASE_MODULE_LIST="cmake/3.23.1,/" - # Cuda/11 modules available only on the dev queue (rhel8 OS); gcc/8.3.1 load by default - RHEL8_CUDA11_MODULE_LIST="cmake/3.23.1,/,openblas/0.3.20/gcc/9.3.0" + BASE_MODULE_LIST="cmake/3.23.1,/" + # Cuda/11 modules available rhel8 queue (rhel8 OS); gcc/8.3.1 load by default + RHEL8_CUDA11_MODULE_LIST="cmake/3.23.1,cuda/11.2.2/gcc/8.3.1,openblas/0.3.18/gcc/8.3.1" # Don't do Threads on weaver GCC_IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "cuda/10.1.243 $CUDA10_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.2.2 
$RHEL8_CUDA11_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + COMPILERS=("cuda/11.2.2/gcc/8.3.1 $RHEL8_CUDA11_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $GCC72_MODULE_TPL_LIST "Serial,OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/7.4.0 $GCC74_MODULE_TPL_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "cuda/9.2.88 $CUDA_MODULE_TPL_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.1.243 $CUDA10_MODULE_TPL_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + COMPILERS=("cuda/11.2.2/gcc/8.3.1 $RHEL8_CUDA11_MODULE_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "gcc/9.3.0 $GCC93_MODULE_TPL_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" "clang/13.0.0 $CLANG13_MODULE_TPL_LIST "Cuda" clang++ $CUDA_WARNING_FLAGS" ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.4.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.3.1 $RHEL8_BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/8.3.1 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.2.089 
$CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.2.2/gcc/8.3.1 $RHEL8_CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "clang/13.0.0 $CLANG13_MODULE_TPL_LIST $CUDA_IBM_BUILD_LIST clang++ $CUDA_WARNING_FLAGS" ) fi @@ -682,17 +674,26 @@ elif [ "$MACHINE" = "caraway" ]; then # output description and success based only on build succes; build time output (no run-time) BASE_MODULE_LIST="cmake/3.19.3,/" + ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" HIPCLANG_BUILD_LIST="Hip_Serial" HIPCLANG_WARNING_FLAGS="" - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - ) + if [ "$SPOT_CHECK_TPLS" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/5.2.0 $ROCM520_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=VEGA908" @@ -707,10 +708,9 @@ elif [ "$MACHINE" = "blake" ]; then 
BASE_MODULE_LIST="cmake/3.19.3,/" BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" - BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/" + BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/,/oneAPI/hpc-toolkit/" ONEAPI_WARNING_FLAGS="" - GCC72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gcc/7.2.0" GCC102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21/gcc/10.2.0" if [ "$SPOT_CHECK" = "True" ]; then @@ -718,31 +718,26 @@ elif [ "$MACHINE" = "blake" ]; then # TODO: Failing toolchains: #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" - "clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" + COMPILERS=("clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" "intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gcc/10.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/11.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - # TODO: Failing toolchains: - #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("gcc/7.2.0 $GCC72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" - "intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gcc/10.2.0 $GCC102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" ) else - COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc 
$INTEL_WARNING_FLAGS" - "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/2021.2.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" + "intel/2021.4.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" + "intel/2022.1.2 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" ) @@ -752,6 +747,40 @@ elif [ "$MACHINE" = "blake" ]; then ARCH_FLAG="--arch=SKX" fi SPACK_HOST_ARCH="+skx" +elif [ "$MACHINE" = "solo" ]; then + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + module load cmake/3.22.3 + + BASE_MODULE_LIST="cmake/3.22.3,/" + BASE_MODULE_LIST_LLVM="cmake/3.22.3,/,gnu/10.2.1" + BASE_MODULE_LIST_INTEL="cmake/3.22.3,gnu/8.2.1,/" + ONEAPI_WARNING_FLAGS="" + + GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" + + if [ "$SPOT_CHECK" = "True" ]; then + COMPILERS=( + "gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" + "llvm/10.0.1 $BASE_MODULE_LIST_LLVM "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" + ) + elif [ "$SPOT_CHECK_TPLS" = "True" ]; then + COMPILERS=("intel/19.0.5.281 $BASE_MODULE_LIST_INTEL,mkl/19.0.5.281 "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" + "gnu/10.2.1 
$GNU102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" + ) + else + ###"clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + COMPILERS=( + "gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + ) + + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=BDW" + fi + SPACK_HOST_ARCH="+bdw" elif [ "$MACHINE" = "kokkos-dev-2" ]; then MODULE_ENVIRONMENT="source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh ; module use /home/projects/x86-64/modulefiles/local" eval "$MODULE_ENVIRONMENT" @@ -1025,6 +1054,9 @@ setup_env() { NEW_TPL_LIST="cublas,cusparse," export KOKKOS_CUDA_OPTIONS="${KOKKOS_CUDA_OPTIONS},enable_lambda" fi + if [[ "$compiler" == rocm* ]]; then + NEW_TPL_LIST="rocblas,rocsparse," + fi # host tpls - use mkl with intel, else use host blas if [[ "$compiler" == intel* ]]; then NEW_TPL_LIST="mkl," @@ -1139,6 +1171,9 @@ single_build_and_test() { fi echo " module purge" &>> reload_modules.sh echo " module load $compiler_modules_list" &>> reload_modules.sh + echo " export OMP_NUM_THREADS=$omp_num_threads" &>> reload_modules.sh + echo " export OMP_PROC_BIND=$omp_proc_bind" &>> reload_modules.sh + echo " export OMP_PLACES=$omp_places" &>> reload_modules.sh echo "" &>> reload_modules.sh chmod +x reload_modules.sh @@ -1173,6 +1208,10 @@ single_build_and_test() { local extra_args="$extra_args --no-default-eti" fi + if [ "${ENABLE_TEST_ETI_ONLY}" = "False" ]; then + local extra_args="$extra_args --disable-test-eti-only" + fi + if [ "${ENABLE_PERFTESTS}" = "False" ]; then local extra_args="$extra_args --disable-perftests" fi diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index 2fdcd740e2..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ 
-//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_BSPGEMM_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in deleted file mode 100644 index 9be44095f0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_GAUSS_SEIDEL_APPLY_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index 1e3befcc89..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_GAUSS_SEIDEL_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index 493740dfb2..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in deleted file mode 100644 index 980540f7ba..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -*/ - -namespace KokkosSparse { -namespace Impl { - -@SPARSE_GMRES_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index 943b721880..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { - -@SPARSE_PAR_ILUT_NUMERIC_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index c30fe10f82..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { - -@SPARSE_PAR_ILUT_SYMBOLIC_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index 43b1da79d1..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPADD_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index 131960272e..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPADD_SYMBOLIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in deleted file mode 100644 index 313f1a88d0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPGEMM_JACOBI_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in deleted file mode 100644 index 2ca1ecf07b..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPGEMM_NOREUSE_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index af422e6fe5..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPGEMM_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index 2f3870e948..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPGEMM_SYMBOLIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index fe5cc1bfa7..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { - -@SPARSE_SPILUK_NUMERIC_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index bfffae9dc0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { - -@SPARSE_SPILUK_SYMBOLIC_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in deleted file mode 100644 index 5a7977921d..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in +++ /dev/null @@ -1,28 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Experimental { -namespace Impl { -// clang-format off -@SPARSE_SPMV_BSRMATRIX_ETI_DECL_BLOCK@ -// clang-format on -} // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in deleted file mode 100644 index 14813536f0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPMV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in deleted file mode 100644 index af58d3e7fc..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPMV_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in deleted file mode 100644 index 11ba625f3c..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPMV_MV_STRUCT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in deleted file mode 100644 index a03fcf586e..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPMV_STRUCT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in deleted file mode 100644 index aa3d2b2cef..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPTRSV_SOLVE_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index 4c48c895a1..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPTRSV_SYMBOLIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in deleted file mode 100644 index 5b24a276d0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_TRSV_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_TRSV_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_TRSV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 79ca6c778d..2d408f9440 100644 --- a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -377,6 +377,5 @@ struct BSPGEMM_NUMERIC< false, true>; //#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp b/sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp new file mode 100644 index 0000000000..7c232fc6ab --- /dev/null +++ b/sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp @@ -0,0 +1,147 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_BSR_TO_CRS_IMPL_HPP +#define KOKKOSSPARSE_BSR_TO_CRS_IMPL_HPP + +#include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +namespace KokkosSparse { + +namespace Impl { + +/*! 
\brief Create an equivalent point matrix from a Bsr matrix
+    The Crs and Bsr matrix do not have to be on the same device
+
+    The Bsr row map, entries and values are copied into host mirrors, expanded
+    there with serial loops, and the resulting Crs arrays are copied into
+    views built from the Crs matrix's own view types. Every stored value of
+    every block becomes one point entry (blockDim * blockDim per block), and
+    the column indices inside each point row are sorted in ascending order.
+
+    \tparam Crs Point matrix type to produce. It does not appear in the
+                argument list, so the caller has to name it explicitly.
+    \tparam Bsr Block matrix type of the input.
+    \param bsr  Matrix to expand. Block values are read with the in-block row
+                as the slower index:
+                offset = block * bs * bs + localRow * bs + localCol.
+    \return A Crs matrix with numRows() * blockDim() rows,
+            numCols() * blockDim() columns and nnz() * blockDim()^2 entries.
+*/
+template <typename Crs, typename Bsr>
+Crs bsr_to_crs(const Bsr &bsr) {
+  using crs_device_type  = typename Crs::device_type;
+  using crs_values_type  = typename Crs::values_type;
+  using crs_index_type   = typename Crs::index_type;
+  using crs_ordinal_type = typename Crs::non_const_ordinal_type;
+  using crs_scalar_type  = typename Crs::non_const_value_type;
+  using crs_size_type    = typename Crs::non_const_size_type;
+
+  // writable row map on the Crs device (the matrix's own row map type may be
+  // const-valued, and this one has to be the target of a deep_copy)
+  using crs_row_map_type =
+      Kokkos::View<typename Crs::row_map_type::non_const_data_type,
+                   crs_device_type>;
+  using bsr_ordinal_type = typename Bsr::non_const_ordinal_type;
+
+  using bsr_size_type = typename Bsr::non_const_size_type;
+
+  // determine what some output matrix parameter will be
+  const size_t bs                   = bsr.blockDim();
+  const crs_ordinal_type crsNumRows = bsr.numRows() * bs;
+  const crs_ordinal_type crsNumCols = bsr.numCols() * bs;
+  const crs_size_type crsNnz        = bsr.nnz() * bs * bs;
+
+  // clone Bsr row map, column indices and values to host memory space
+  auto bRows = Kokkos::create_mirror_view(bsr.graph.row_map);
+  auto bInds = Kokkos::create_mirror_view(bsr.graph.entries);
+  auto bVals = Kokkos::create_mirror_view(bsr.values);
+  Kokkos::deep_copy(bRows, bsr.graph.row_map);
+  Kokkos::deep_copy(bInds, bsr.graph.entries);
+  Kokkos::deep_copy(bVals, bsr.values);
+
+  // NOTE(review): std::pair, std::vector, std::map and std::sort are used
+  // below, but this header only includes the two KokkosSparse matrix headers.
+  // Confirm <utility>, <vector>, <map> and <algorithm> arrive transitively,
+  // or include them here.
+  using Entry =
+      std::pair<crs_ordinal_type, crs_scalar_type>;  // {column, value}
+  using Row = std::vector<Entry>;                    // all entries in a row
+  std::map<crs_ordinal_type, Row> rows;  // point row index -> its entries
+
+  // sort entries in a row by column
+  auto by_col = [](const Entry &a, const Entry &b) {
+    return a.first < b.first;
+  };
+
+  // Convert BSR data into CRS rows
+  for (bsr_ordinal_type bRow = 0; bRow < bsr_ordinal_type(bsr.numRows());
+       ++bRow) {
+    for (bsr_size_type bColIdx = bRows(bRow); bColIdx < bRows(bRow + 1);
+         ++bColIdx) {
+      const crs_ordinal_type bCol = bInds(bColIdx);
+
+      // add all points in this block
+      for (bsr_size_type lr = 0; lr < bsr_size_type(bs); ++lr) {
+        const crs_ordinal_type cRow = bRow * bs + lr;
+
+        // The point row is the same for the whole inner loop, so look it up
+        // once; operator[] creates the (empty) row the first time it is seen.
+        Row &rowEntries = rows[cRow];
+        for (bsr_size_type lc = 0; lc < bsr_size_type(bs); ++lc) {
+          const crs_size_type cvi     = bColIdx * bs * bs + lr * bs + lc;
+          const crs_ordinal_type cCol = bCol * bs + lc;
+          const crs_scalar_type cVal  = bVals(cvi);
+          rowEntries.emplace_back(cCol, cVal);
+        }
+      }
+    }
+  }
+
+  // device and host views of Crs data
+  crs_row_map_type devCrsRows("crs row map", crsNumRows + 1);
+  crs_index_type devCrsIdx("crs columns", crsNnz);
+  crs_values_type devCrsVals("crs values", crsNnz);
+  auto hostCrsRows = Kokkos::create_mirror_view(devCrsRows);
+  auto hostCrsIdx  = Kokkos::create_mirror_view(devCrsIdx);
+  auto hostCrsVals = Kokkos::create_mirror_view(devCrsVals);
+
+  // convert to Crs format
+  // The first row starts at offset 0; written explicitly instead of relying
+  // on the initial contents of the view.
+  hostCrsRows(0)           = 0;
+  crs_ordinal_type iRowMap = 0;  // next row whose end offset is not yet set
+  crs_size_type nentries   = 0;  // entries written so far
+  for (auto &kv : rows) {  // std::map iterates in ascending row order
+    const crs_ordinal_type &row = kv.first;   // point row index
+    Row &entries                = kv.second;  // non-zeros of that row
+
+    // Close every row before this one: rows that were skipped are empty, and
+    // the previously filled row ends at the entries written so far.
+    for (; iRowMap < row; ++iRowMap) {
+      hostCrsRows(iRowMap + 1) = nentries;
+    }
+
+    // make sure crs points in each row are sorted by column
+    std::sort(entries.begin(), entries.end(), by_col);
+
+    // add columns and values to Crs data
+    for (const Entry &entry : entries) {
+      hostCrsIdx(nentries)  = entry.first;
+      hostCrsVals(nentries) = entry.second;
+      ++nentries;
+    }
+  }
+  // close the last filled row and any trailing empty rows
+  for (; iRowMap < crsNumRows; ++iRowMap) {
+    hostCrsRows(iRowMap + 1) = nentries;
+  }
+
+  // move to device
+  Kokkos::deep_copy(devCrsRows, hostCrsRows);
+  Kokkos::deep_copy(devCrsIdx, hostCrsIdx);
+  Kokkos::deep_copy(devCrsVals, hostCrsVals);
+
+  // construct the resulting Crs matrix
+  Crs crs("", crsNumRows, crsNumCols, crsNnz, devCrsVals, devCrsRows,
+          devCrsIdx);
+  return crs;
+}  // bsr_to_crs
+
+}  // 
namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOSSPARSE_BSR_TO_CRS_IMPL_HPP \ No newline at end of file diff --git a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index ec6cb6f02d..501e71e3e7 100644 --- a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -247,16 +247,16 @@ class ClusterGaussSeidel { nnz_scalar_t _omega; - Team_PSGS( - const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, - const_scalar_nnz_view_t adj_vals_, x_value_array_type Xvector_, - y_value_array_type Yvector_, nnz_lno_t color_set_begin_, - nnz_lno_t color_set_end_, nnz_lno_persistent_work_view_t color_adj_, - nnz_lno_persistent_work_view_t cluster_offsets_, - nnz_lno_persistent_work_view_t cluster_verts_, - scalar_persistent_work_view_t inverse_diagonal_, - nnz_lno_t clusters_per_team_, - nnz_scalar_t omega_ = Kokkos::Details::ArithTraits::one()) + Team_PSGS(const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, + const_scalar_nnz_view_t adj_vals_, x_value_array_type Xvector_, + y_value_array_type Yvector_, nnz_lno_t color_set_begin_, + nnz_lno_t color_set_end_, + nnz_lno_persistent_work_view_t color_adj_, + nnz_lno_persistent_work_view_t cluster_offsets_, + nnz_lno_persistent_work_view_t cluster_verts_, + scalar_persistent_work_view_t inverse_diagonal_, + nnz_lno_t clusters_per_team_, + nnz_scalar_t omega_ = Kokkos::ArithTraits::one()) : _xadj(xadj_), _adj(adj_), _adj_vals(adj_vals_), @@ -691,7 +691,7 @@ class ClusterGaussSeidel { _diagonals(diagonals_), num_total_rows(num_total_rows_), rows_per_team(rows_per_team_), - one(Kokkos::Details::ArithTraits::one()) {} + one(Kokkos::ArithTraits::one()) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t row_id) const { @@ -781,12 +781,12 @@ class ClusterGaussSeidel { } template - void apply( - x_value_array_type x_lhs_output_vec, y_value_array_type y_rhs_input_vec, - bool init_zero_x_vector = 
false, int numIter = 1, - nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - bool apply_forward = true, bool apply_backward = true, - bool /*update_y_vector*/ = true) { + void apply(x_value_array_type x_lhs_output_vec, + y_value_array_type y_rhs_input_vec, + bool init_zero_x_vector = false, int numIter = 1, + nnz_scalar_t omega = Kokkos::ArithTraits::one(), + bool apply_forward = true, bool apply_backward = true, + bool /*update_y_vector*/ = true) { auto gsHandle = get_gs_handle(); size_type nnz = entries.extent(0); diff --git a/sparse/impl/KokkosSparse_coo2crs_impl.hpp b/sparse/impl/KokkosSparse_coo2crs_impl.hpp new file mode 100644 index 0000000000..d00a6f34a9 --- /dev/null +++ b/sparse/impl/KokkosSparse_coo2crs_impl.hpp @@ -0,0 +1,280 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSSPARSE_COO2CRS_IMPL_HPP +#define KOKKOSSPARSE_COO2CRS_IMPL_HPP +// The unorderedmap changes necessary for this to work +// have not made it into Kokkos 4.0.00 pr 4.0.01 will +// need to see if it happens in 4.1.00 to have a final +// version check here. 
+#if KOKKOS_VERSION >= 40099 + +#include +#include "Kokkos_UnorderedMap.hpp" +#include "KokkosKernels_Utils.hpp" + +namespace KokkosSparse { +namespace Impl { +template +class Coo2Crs { + private: + using RowViewScalarType = typename RowViewType::value_type; + using ColViewScalarType = typename ColViewType::value_type; + using DataViewScalarType = typename DataViewType::value_type; + using CrsST = DataViewScalarType; + using CrsOT = RowViewScalarType; + using CrsET = typename DataViewType::execution_space; + using CrsMT = void; + using CrsSzT = ColViewScalarType; + using CrsType = CrsMatrix; + using CrsValsViewType = typename CrsType::values_type; + using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type; + using CrsColIdViewType = typename CrsType::index_type; + + using UmapValueViewType = Kokkos::View; + using UmapOpTypes = + Kokkos::UnorderedMapInsertOpTypes; + using UmapOpType = typename UmapOpTypes::AtomicAdd; + + // Make public for Kokkos::View + public: + using UmapHasherType = typename Kokkos::pod_hash; + using UmapEqualToType = typename Kokkos::pod_equal_to; + using UmapType = Kokkos::UnorderedMap; + using UmapMemorySpace = typename UmapType::device_type::memory_space; + + // Public for kokkos policies + struct coo2crsRp1 {}; + struct rowmapRp1 {}; + struct copyTp1 {}; + struct copyRp1 {}; + + using copyTp1Pt = Kokkos::TeamPolicy; + using copyTp1MemberType = typename copyTp1Pt::member_type; + + private: + using CrsRowMapView = Kokkos::View; + using CrsRowMapAtomicView = + Kokkos::View>; + using CrsValuesView = Kokkos::View; + using CrsColIdsView = Kokkos::View; + + // Needed since Kokkos::Bitset cannot be accessed on the host + using BmapViewType = + Kokkos::View>; + using Bitset = Kokkos::Bitset; + + CrsRowMapView m_crs_row_map; + CrsRowMapAtomicView m_crs_row_map_tmp; + CrsValuesView m_crs_vals; + CrsColIdsView m_crs_col_ids; + UmapType *m_umaps; + BmapViewType m_capacity_bmap; + Bitset m_tuple_bmap; + UmapOpType m_insert_op; + CrsOT 
m_nrows; + CrsOT m_ncols; + RowViewType m_row; + ColViewType m_col; + DataViewType m_data; + CrsSzT m_nnz; + + int m_n_tuples; + + public: + KOKKOS_INLINE_FUNCTION + void operator()(const coo2crsRp1 &, const int &idx) const { + auto i = m_row(idx); + auto j = m_col(idx); + auto is_inserted = m_tuple_bmap.test(idx); + + if (i >= m_nrows || j >= m_ncols) { + Kokkos::abort("tuple is out of bounds"); + } else if (!is_inserted && i >= 0 && j >= 0) { + if (m_umaps[i].insert(j, m_data(idx), m_insert_op).failed()) { + m_capacity_bmap(i) = true; // hmap at index i reached capacity + } else { + m_tuple_bmap.set(idx); // checklist of inserted tuples + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const copyRp1 &, const int &i) const { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UN + for (int j = 0; j < m_ncols; j++) { + if (m_umaps[i].exists(j)) { + auto umap_idx = m_umaps[i].find(j); + auto offset = m_crs_row_map_tmp(i)++; + m_crs_vals(offset) = m_umaps[i].value_at(umap_idx); + m_crs_col_ids(offset) = m_umaps[i].key_at(umap_idx); + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const copyTp1 &, const copyTp1MemberType &member) const { + auto row_idx = member.league_rank(); + auto cpy_beg = m_crs_row_map(row_idx); + auto cpy_end = m_crs_row_map(row_idx + 1); + auto cpy_len = cpy_end - cpy_beg; + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, cpy_len), + [&](const CrsOT &i) { + auto offset = i + cpy_beg; + m_crs_vals(offset) = m_umaps[i].value_at(i); + m_crs_col_ids(offset) = m_umaps[i].key_at(i); + }); + } + + Coo2Crs(DimType m, DimType n, RowViewType row, ColViewType col, + DataViewType data) { + m_n_tuples = data.extent(0); + m_nrows = m; + m_ncols = n; + m_row = row; + m_col = col; + m_data = data; + + typename UmapType::size_type arg_capacity_hint = + m_nrows > 0 ? 
(m_n_tuples / m_nrows / 4) : 16; + typename UmapType::hasher_type arg_hasher; + typename UmapType::equal_to_type arg_equal_to; + arg_capacity_hint = arg_capacity_hint < 16 ? 16 : arg_capacity_hint; + + // Record of whether capacity was reached in any unordered map + m_capacity_bmap = BmapViewType("m_capacity_bmap", m_nrows); + typename BmapViewType::HostMirror m_capacity_bmap_mirror = + Kokkos::create_mirror_view(m_capacity_bmap); + + // Track which tuples have been processed + m_tuple_bmap = Bitset(m_n_tuples); + + m_crs_row_map = CrsRowMapView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map"), + m_nrows + 1); + + // Memory management notes for `umap_ptrs` and `m_umaps`: + // `umap_ptrs` is a two dimensional array. The first dimension contains + // pointers to mixed-memory (host and device memory). The second + // dimension is the array of UnorderedMap objects. Some of the object + // methods are callable from only the device (device-callable), others + // are callable from only the host. Some of the host-callable methods, + // such as rehash are intended to be observable on the device. + // See Kokkos::UnorderedMap for details. + // + // `m_umaps` is a single dimension array of device memory. This array + // contains a shallow copy of all the UnorderedMap members that are + // allocated manually below. + // + // Any time a host-callable method with device observable results is + // invoked, we must shallow-copy the given `umap_ptrs` member back to + // the device. + // + // However, since we are using shallow copies of objects of type + // UnorderedMap, we do not need to copy the device memory back to + // the host before using a host-callable method. 
+ + // Setup a nrows length array of Unordered Maps + m_umaps = + reinterpret_cast(Kokkos::kokkos_malloc( + "m_umaps", m_nrows * sizeof(UmapType))); + + using shallow_copy_to_device = + Kokkos::Impl::DeepCopy; + + UmapType **umap_ptrs = new UmapType *[m_nrows]; + // TODO: use host-level parallel_for with tag rowmapRp1 + for (int i = 0; i < m_nrows; i++) { + umap_ptrs[i] = new UmapType(arg_capacity_hint, arg_hasher, arg_equal_to); + shallow_copy_to_device(m_umaps + i, umap_ptrs[i], sizeof(UmapType)); + } + + using coo2crsRp1Pt = Kokkos::RangePolicy; + bool rehashed = true; + while (rehashed) { + Kokkos::parallel_for("coo2crsRp1", coo2crsRp1Pt(0, m_n_tuples), *this); + + CrsET().fence(); // Wait for bitmap writes to land + Kokkos::deep_copy(m_capacity_bmap_mirror, m_capacity_bmap); + CrsET().fence(); + + rehashed = false; + // TODO: covert to host-level parallel for. + for (int i = 0; i < m_nrows; i++) { + if (m_capacity_bmap_mirror(i)) { + umap_ptrs[i]->rehash(umap_ptrs[i]->capacity() * 2); + rehashed = true; + m_capacity_bmap_mirror(i) = false; + shallow_copy_to_device(m_umaps + i, umap_ptrs[i], sizeof(UmapType)); + } + } + Kokkos::deep_copy(m_capacity_bmap, m_capacity_bmap_mirror); + CrsET().fence(); + } + + typename CrsRowMapView::HostMirror m_crs_row_map_h = + Kokkos::create_mirror_view(m_crs_row_map); + + // TODO: convert to host-level parallel_for / prefix sum + m_crs_row_map_h(0) = 0; + for (int i = 1; i < m_nrows + 1; i++) { + auto adj_i = i - 1; + auto sz = umap_ptrs[adj_i]->size(); + m_crs_row_map_h(i) = m_crs_row_map_h(adj_i) + sz; + } + + m_crs_row_map_tmp = CrsRowMapAtomicView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map_tmp"), + m_nrows + 1); + Kokkos::deep_copy(m_crs_row_map, m_crs_row_map_h); + Kokkos::deep_copy(m_crs_row_map_tmp, m_crs_row_map_h); + CrsET().fence(); + + m_nnz = m_crs_row_map_h(m_nrows); + + m_crs_vals = CrsValuesView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_vals"), m_nnz); + m_crs_col_ids = 
CrsColIdsView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_col_ids"), + m_nnz); + + using copyRp1Pt = Kokkos::RangePolicy; + Kokkos::parallel_for("copyRp1", copyRp1Pt(0, m_nrows), *this); + CrsET().fence(); + + // Cleanup + for (int i = 0; i < m_nrows; i++) { + delete umap_ptrs[i]; + } + delete[] umap_ptrs; + Kokkos::kokkos_free(m_umaps); + } + + CrsType get_crsMat() { + return CrsType("coo2crs", m_nrows, m_ncols, m_nnz, m_crs_vals, + m_crs_row_map, m_crs_col_ids); + } +}; +} // namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOS_VERSION >= 40099 + +#endif // KOKKOSSPARSE_COO2CRS_IMPL_HPP diff --git a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp new file mode 100644 index 0000000000..8e4c187b99 --- /dev/null +++ b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp @@ -0,0 +1,124 @@ +namespace KokkosSparse { + +#include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +namespace Impl { + +/*! \brief Expand each entry of a crs matrix to a block in a bsr matrix + The scalar, ordinal, and device types of the two matrices do not need + to be compatible +*/ +template +Bsr expand_crs_to_bsr(const Crs &crs, size_t blockSize) { + using bsr_device_type = typename Bsr::device_type; + using bsr_execution_space = typename Bsr::execution_space; + + using crs_values_type = typename Crs::values_type; + using bsr_values_type = typename Bsr::values_type; + + using crs_index_type = typename Crs::index_type; + using bsr_index_type = typename Bsr::index_type; + + using crs_row_map_type = typename Crs::row_map_type; + using bsr_row_map_type = + Kokkos::View; + + // construct the Bsr row map + bsr_row_map_type bsrRowMap("bsrRowMap", crs.graph.row_map.size()); + { + // clone Crs row map in Bsr memory space + Kokkos::View + crows("crows", crs.graph.row_map.size()); + Kokkos::deep_copy(crows, crs.graph.row_map); + + // copy to actual row map + Kokkos::RangePolicy policy(0, + crs.graph.row_map.size()); + 
Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(size_t i) { bsrRowMap(i) = crows(i); }); + } + + // construct the BSR col indices + bsr_index_type bsrIndices("bsrIndices", crs.graph.entries.size()); + { + // clone Crs row map in Bsr memory space + Kokkos::View + cinds("cinds", crs.graph.entries.size()); + Kokkos::deep_copy(cinds, crs.graph.entries); + + // copy to actual row map + Kokkos::RangePolicy policy(0, + crs.graph.entries.size()); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(size_t i) { bsrIndices(i) = cinds(i); }); + } + + // construct BSR values + bsr_values_type bsrVals("bsrVals", crs.nnz() * blockSize * blockSize); + { + // clone Crs values in Bsr memory space + Kokkos::View + cvals("cvals", crs.values.size()); + Kokkos::deep_copy(cvals, crs.values); + + // copy to actual values + Kokkos::RangePolicy policy(0, crs.values.size()); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(size_t i) { + for (size_t ii = i; ii < i + blockSize * blockSize; ++ii) { + bsrVals(ii) = cvals(i); + } + }); + } + + Bsr bsr("", crs.numRows(), crs.numCols(), crs.nnz(), bsrVals, bsrRowMap, + bsrIndices, blockSize); + return bsr; +} // expand_crs_to_bsr + +/*! 
\brief convert a crs already in block format to a Bsr matrix + */ +template +Bsr blocked_crs_to_bsr(const Crs &crs, size_t blockSize) { + using bsr_value_type = typename Bsr::value_type; + using bsr_ordinal_type = typename Bsr::ordinal_type; + + // copy matrix data to host + auto hRowMap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + crs.graph.row_map); + auto hColInds = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + crs.graph.entries); + auto hVals = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), crs.values); + Kokkos::fence(); + + // construct COO data on host + std::vector vals; + std::vector rows, cols; + + vals.reserve(crs.nnz()); + rows.reserve(crs.nnz()); + cols.reserve(crs.nnz()); + + for (bsr_ordinal_type row = 0; row < bsr_ordinal_type(hRowMap.size()) - 1; + ++row) { + for (size_t ci = hRowMap(row); ci < hRowMap(row + 1); ++ci) { + bsr_ordinal_type col = hColInds(ci); + bsr_value_type val = hVals(ci); + + rows.push_back(row); + cols.push_back(col); + vals.push_back(val); + } + } + + Bsr bsr("", crs.numRows(), crs.numCols(), crs.nnz(), vals.data(), rows.data(), + cols.data(), blockSize); + return bsr; +} // expand_crs_to_bsr + +} // namespace Impl +} // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 75f827a84d..e4cfb4b047 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -260,21 +260,20 @@ class PointGaussSeidel { // long rows. 
nnz_lno_t _long_row_par; - Team_PSGS( - row_lno_persistent_work_view_t xadj_, - nnz_lno_persistent_work_view_t adj_, - scalar_persistent_work_view_t adj_vals_, - scalar_persistent_work_view2d_t Xvector_, - scalar_persistent_work_view2d_t Yvector_, nnz_lno_t color_set_begin, - nnz_lno_t color_set_end, - scalar_persistent_work_view_t permuted_inverse_diagonal_, - pool_memory_space pms, nnz_lno_t _num_max_vals_in_l1 = 0, - nnz_lno_t _num_max_vals_in_l2 = 0, - nnz_scalar_t omega_ = Kokkos::Details::ArithTraits::one(), - - nnz_lno_t block_size_ = 1, nnz_lno_t team_work_size_ = 1, - size_t shared_memory_size_ = 16, int suggested_team_size_ = 1, - int vector_size_ = 1) + Team_PSGS(row_lno_persistent_work_view_t xadj_, + nnz_lno_persistent_work_view_t adj_, + scalar_persistent_work_view_t adj_vals_, + scalar_persistent_work_view2d_t Xvector_, + scalar_persistent_work_view2d_t Yvector_, + nnz_lno_t color_set_begin, nnz_lno_t color_set_end, + scalar_persistent_work_view_t permuted_inverse_diagonal_, + pool_memory_space pms, nnz_lno_t _num_max_vals_in_l1 = 0, + nnz_lno_t _num_max_vals_in_l2 = 0, + nnz_scalar_t omega_ = Kokkos::ArithTraits::one(), + + nnz_lno_t block_size_ = 1, nnz_lno_t team_work_size_ = 1, + size_t shared_memory_size_ = 16, int suggested_team_size_ = 1, + int vector_size_ = 1) : _xadj(xadj_), _adj(adj_), _adj_vals(adj_vals_), @@ -1283,7 +1282,7 @@ class PointGaussSeidel { rows_per_team(rows_per_team_), block_size(block_size_), block_matrix_size(block_matrix_size_), - one(Kokkos::Details::ArithTraits::one()) {} + one(Kokkos::ArithTraits::one()) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t& row_id) const { @@ -1489,7 +1488,7 @@ class PointGaussSeidel { void block_apply( x_value_array_type x_lhs_output_vec, y_value_array_type y_rhs_input_vec, bool init_zero_x_vector = false, int numIter = 1, - nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), + nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = 
true, bool update_y_vector = true) { auto gsHandle = this->get_gs_handle(); @@ -1613,7 +1612,7 @@ class PointGaussSeidel { void point_apply( x_value_array_type x_lhs_output_vec, y_value_array_type y_rhs_input_vec, bool init_zero_x_vector = false, int numIter = 1, - nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), + nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) { auto gsHandle = get_gs_handle(); @@ -1690,12 +1689,12 @@ class PointGaussSeidel { } template - void apply( - x_value_array_type x_lhs_output_vec, y_value_array_type y_rhs_input_vec, - bool init_zero_x_vector = false, int numIter = 1, - nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - bool apply_forward = true, bool apply_backward = true, - bool update_y_vector = true) { + void apply(x_value_array_type x_lhs_output_vec, + y_value_array_type y_rhs_input_vec, + bool init_zero_x_vector = false, int numIter = 1, + nnz_scalar_t omega = Kokkos::ArithTraits::one(), + bool apply_forward = true, bool apply_backward = true, + bool update_y_vector = true) { auto gsHandle = get_gs_handle(); if (gsHandle->is_numeric_called() == false) { this->initialize_numeric(); diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 2f6bb4d9b4..f04ae34fc9 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -509,8 +509,5 @@ struct GAUSS_SEIDEL_APPLY; #include -#include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp b/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp index b1ec07e768..91145335f5 100644 --- a/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp +++ b/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp @@ -75,7 +75,7 @@ struct CrsMatrixGetDiagCopyWithOffsetsFunctor { /// \param lclRow [in] The 
current (local) row of the sparse matrix. KOKKOS_INLINE_FUNCTION void operator()(const LO& lclRow) const { const offset_type INV = KokkosSparse::OrdinalTraits::invalid(); - const scalar_type ZERO = Kokkos::Details::ArithTraits::zero(); + const scalar_type ZERO = Kokkos::ArithTraits::zero(); // If the row lacks a stored diagonal entry, then its value is zero. D_(lclRow) = ZERO; diff --git a/sparse/impl/KokkosSparse_gmres_spec.hpp b/sparse/impl/KokkosSparse_gmres_spec.hpp index f0498e5efe..bfe1c4539a 100644 --- a/sparse/impl/KokkosSparse_gmres_spec.hpp +++ b/sparse/impl/KokkosSparse_gmres_spec.hpp @@ -158,6 +158,5 @@ struct GMRES; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index b8a25485f5..d8754e591c 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -17,9 +17,23 @@ #ifndef KOKKOSSPARSE_MDF_IMPL_HPP_ #define KOKKOSSPARSE_MDF_IMPL_HPP_ +#include +#include +#include "Kokkos_ArithTraits.hpp" + namespace KokkosSparse { namespace Impl { +template +struct MDF_types { + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + using scalar_mag_type = typename KAS::mag_type; + using values_mag_type = Kokkos::View; +}; + template struct MDF_count_lower { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -54,27 +68,28 @@ struct MDF_discarded_fill_norm { using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = typename static_crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; - using KAS = typename Kokkos::ArithTraits; - - const scalar_type zero = KAS::zero(); + using values_type = typename 
crs_matrix_type::values_type::non_const_type; + using values_mag_type = typename MDF_types::values_mag_type; + using size_type = typename crs_matrix_type::size_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + using scalar_mag_type = typename KAS::mag_type; + using KAM = typename Kokkos::ArithTraits; crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; - values_type discarded_fill; + values_mag_type discarded_fill; col_ind_type deficiency; int verbosity; MDF_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, ordinal_type factorization_step_, col_ind_type permutation_, - values_type discarded_fill_, col_ind_type deficiency_, - int verbosity_) + values_mag_type discarded_fill_, + col_ind_type deficiency_, int verbosity_) : A(A_), At(At_), factorization_step(factorization_step_), @@ -85,10 +100,11 @@ struct MDF_discarded_fill_norm { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - ordinal_type rowIdx = permutation(i); - scalar_type discard_norm = zero, diag_val = zero; - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; + ordinal_type rowIdx = permutation(i); + scalar_mag_type discard_norm = KAM::zero(); + scalar_type diag_val = KAS::zero(); + bool entryIsDiscarded = true; + ordinal_type numFillEntries = 0; for (size_type alphaIdx = At.graph.row_map(rowIdx); alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { ordinal_type fillRowIdx = At.graph.entries(alphaIdx); @@ -125,13 +141,15 @@ struct MDF_discarded_fill_norm { KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - int(At.graph.entries(alphaIdx)), - int(A.graph.entries(betaIdx)), - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - 
KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), - int(rowIdx)); + if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Adding value A[%d,%d]=%f to discard norm of row %d\n", + int(At.graph.entries(alphaIdx)), + int(A.graph.entries(betaIdx)), + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), + int(rowIdx)); + } } } } @@ -139,25 +157,34 @@ struct MDF_discarded_fill_norm { } else if (fillRowIdx == rowIdx) { diag_val = At.values(alphaIdx); if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, values(%d)=%f\n", int(rowIdx), - int(alphaIdx), At.values(alphaIdx)); + if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value detected, values(%d)=%f\n", int(rowIdx), + int(alphaIdx), At.values(alphaIdx)); + } else if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value detected, |values(%d)|=%f\n", + int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); + } } } } // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / (diag_val * diag_val); + discard_norm = discard_norm / KAS::abs(diag_val * diag_val); discarded_fill(rowIdx) = discard_norm; deficiency(rowIdx) = numFillEntries; - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAS::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); + + if constexpr (std::is_arithmetic_v) { + if (verbosity > 0) { + const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - + A.graph.row_map(rowIdx) - 1); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", + static_cast(rowIdx), + static_cast(KAM::sqrt(discard_norm)), + 
static_cast(deficiency(rowIdx)), static_cast(degree)); + } } } @@ -168,20 +195,21 @@ struct MDF_selective_discarded_fill_norm { using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = typename static_crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; - using KAS = typename Kokkos::ArithTraits; - - const scalar_type zero = KAS::zero(); + using values_type = typename crs_matrix_type::values_type::non_const_type; + using size_type = typename crs_matrix_type::size_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + using scalar_mag_type = typename KAS::mag_type; + using KAM = typename Kokkos::ArithTraits; + using values_mag_type = typename MDF_types::values_mag_type; crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; col_ind_type update_list; - values_type discarded_fill; + values_mag_type discarded_fill; col_ind_type deficiency; int verbosity; @@ -189,7 +217,7 @@ struct MDF_selective_discarded_fill_norm { ordinal_type factorization_step_, col_ind_type permutation_, col_ind_type update_list_, - values_type discarded_fill_, + values_mag_type discarded_fill_, col_ind_type deficiency_, int verbosity_) : A(A_), At(At_), @@ -202,10 +230,11 @@ struct MDF_selective_discarded_fill_norm { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - ordinal_type rowIdx = permutation(update_list(i)); - scalar_type discard_norm = zero, diag_val = zero; - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; + ordinal_type rowIdx = permutation(update_list(i)); + scalar_mag_type discard_norm = KAM::zero(); + scalar_type 
diag_val = KAS::zero(); + bool entryIsDiscarded = true; + ordinal_type numFillEntries = 0; for (size_type alphaIdx = At.graph.row_map(rowIdx); alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { ordinal_type fillRowIdx = At.graph.entries(alphaIdx); @@ -242,14 +271,16 @@ struct MDF_selective_discarded_fill_norm { KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - static_cast(At.graph.entries(alphaIdx)), - static_cast(A.graph.entries(betaIdx)), - static_cast( - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), - static_cast(rowIdx)); + if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Adding value A[%d,%d]=%f to discard norm of row %d\n", + static_cast(At.graph.entries(alphaIdx)), + static_cast(A.graph.entries(betaIdx)), + static_cast( + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), + static_cast(rowIdx)); + } } } } @@ -257,26 +288,36 @@ struct MDF_selective_discarded_fill_norm { } else if (fillRowIdx == rowIdx) { diag_val = At.values(alphaIdx); if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, values(%d)=%f\n", - static_cast(rowIdx), static_cast(alphaIdx), - static_cast(At.values(alphaIdx))); + if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value dected, values(%d)=%f\n", + static_cast(rowIdx), static_cast(alphaIdx), + static_cast(At.values(alphaIdx))); + } else if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value dected, |values(%d)|=%f\n", + static_cast(rowIdx), static_cast(alphaIdx), + static_cast(KAS::abs(At.values(alphaIdx)))); + } } } } // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / (diag_val * diag_val); + 
discard_norm = discard_norm / KAS::abs(diag_val * diag_val); discarded_fill(rowIdx) = discard_norm; deficiency(rowIdx) = numFillEntries; - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAS::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); + + if constexpr (std::is_arithmetic_v) { + if (verbosity > 0) { + const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - + A.graph.row_map(rowIdx) - 1); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", + static_cast(rowIdx), + static_cast(KAM::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); + } } } @@ -289,23 +330,24 @@ struct MDF_select_row { entries_type::non_const_type; using row_map_type = typename crs_matrix_type::StaticCrsGraphType::row_map_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; + using size_type = typename crs_matrix_type::size_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using values_mag_type = typename MDF_types::values_mag_type; // type used to perform the reduction // do not confuse it with scalar_type! 
using value_type = typename crs_matrix_type::ordinal_type; value_type factorization_step; - values_type discarded_fill; + values_mag_type discarded_fill; col_ind_type deficiency; row_map_type row_map; col_ind_type permutation; - MDF_select_row(value_type factorization_step_, values_type discarded_fill_, - col_ind_type deficiency_, row_map_type row_map_, - col_ind_type permutation_) + MDF_select_row(value_type factorization_step_, + values_mag_type discarded_fill_, col_ind_type deficiency_, + row_map_type row_map_, col_ind_type permutation_) : factorization_step(factorization_step_), discarded_fill(discarded_fill_), deficiency(deficiency_), @@ -399,10 +441,12 @@ struct MDF_factorize_row { row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using size_type = typename crs_matrix_type::size_type; - using value_type = typename crs_matrix_type::value_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; + using value_type = typename crs_matrix_type::value_type; + using values_mag_type = typename MDF_types::values_mag_type; + using value_mag_type = typename values_mag_type::value_type; crs_matrix_type A, At; @@ -415,7 +459,7 @@ struct MDF_factorize_row { values_type valuesU; col_ind_type permutation, permutation_inv; - values_type discarded_fill; + values_mag_type discarded_fill; col_ind_type factored; ordinal_type selected_row_idx, factorization_step; @@ -426,7 +470,7 @@ struct MDF_factorize_row { values_type valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, col_ind_type permutation_inv_, - values_type discarded_fill_, col_ind_type factored_, + 
values_mag_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, ordinal_type factorization_step_, int verbosity_) : A(A_), @@ -448,7 +492,7 @@ struct MDF_factorize_row { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type /* idx */) const { const ordinal_type selected_row = permutation(selected_row_idx); - discarded_fill(selected_row) = Kokkos::ArithTraits::max(); + discarded_fill(selected_row) = Kokkos::ArithTraits::max(); // Swap entries in permutation vectors permutation(selected_row_idx) = permutation(factorization_step); @@ -481,32 +525,34 @@ struct MDF_factorize_row { } } row_mapU(factorization_step + 1) = U_entryIdx; - - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", - static_cast(selected_row), - static_cast(diag)); - } - - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); - for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(row_mapU(rowIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesU(entryIdx))); + if constexpr (std::is_arithmetic_v) { + if (verbosity > 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", + static_cast(selected_row), + static_cast(diag)); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesU(entryIdx))); + + if (verbosity > 2) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); + for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; + ++rowIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(row_mapU(rowIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); + for (size_type entryIdx = row_mapU(0); 
+ entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesU(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); + for (size_type entryIdx = row_mapU(0); + entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesU(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } // Insert the lower part of the selected column of A @@ -526,26 +572,28 @@ struct MDF_factorize_row { } row_mapL(factorization_step + 1) = L_entryIdx; - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", - static_cast(factorization_step), - static_cast(factorization_step), - static_cast(factorization_step + 1), - static_cast(row_mapL(factorization_step)), - static_cast(row_mapL(factorization_step + 1))); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesL(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesL(entryIdx))); + if constexpr (std::is_arithmetic_v) { + if (verbosity > 2) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", + static_cast(factorization_step), + static_cast(factorization_step), + static_cast(factorization_step + 1), + static_cast(row_mapL(factorization_step)), + static_cast(row_mapL(factorization_step + 1))); + for (size_type entryIdx = row_mapL(factorization_step); + entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesL(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); + for (size_type 
entryIdx = row_mapL(factorization_step); + entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesL(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } // If this was the last row no need to update A and At! @@ -599,13 +647,14 @@ struct MDF_factorize_row { if (A.graph.entries(entryIdx) == fillColIdx) { A.values(entryIdx) -= At.values(alphaIdx) * A.values(betaIdx) / diag_val; - - if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "A[%d, %d] -= %f\n", static_cast(fillRowIdx), - static_cast(fillColIdx), - static_cast(At.values(alphaIdx) * - A.values(betaIdx) / diag_val)); + if constexpr (std::is_arithmetic_v) { + if (verbosity > 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "A[%d, %d] -= %f\n", static_cast(fillRowIdx), + static_cast(fillColIdx), + static_cast(At.values(alphaIdx) * + A.values(betaIdx) / diag_val)); + } } } } @@ -624,19 +673,21 @@ struct MDF_factorize_row { factored(selected_row) = 1; - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); - for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(A.values(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); - for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(At.values(entryIdx))); + if constexpr (std::is_arithmetic_v) { + if (verbosity > 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); + for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "%f ", static_cast(A.values(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); + for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "%f ", static_cast(At.values(entryIdx))); + } + 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } } // operator() diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp index fd3bc2b8bb..142f6dc912 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp @@ -239,6 +239,5 @@ struct PAR_ILUT_NUMERIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp index b822d12ab0..512752d3d9 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp @@ -172,6 +172,5 @@ struct PAR_ILUT_SYMBOLIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_sor_sequential_impl.hpp b/sparse/impl/KokkosSparse_sor_sequential_impl.hpp index 11cad03145..3ca5ee08bf 100644 --- a/sparse/impl/KokkosSparse_sor_sequential_impl.hpp +++ b/sparse/impl/KokkosSparse_sor_sequential_impl.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOSSPARSE_IMPL_SOR_HPP #define KOKKOSSPARSE_IMPL_SOR_HPP -/// \file Kokkos_Sparse_impl_sor.hpp +/// \file KokkosSparse_impl_sor.hpp /// \brief Sequential implementations of Gauss-Seidel and SOR. /// /// This file exists mainly as a temporary porting aid. 
Until we can @@ -77,7 +77,7 @@ void gaussSeidel(const LocalOrdinal numRows, const LocalOrdinal numCols, const OffsetType b_stride, RangeScalar* const X, const OffsetType x_stride, const MatrixScalar* const D, const MatrixScalar omega, const char direction[]) { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; typedef LocalOrdinal LO; const OffsetType theNumRows = static_cast(numRows); const OffsetType theNumCols = static_cast(numCols); @@ -247,7 +247,7 @@ void reorderedGaussSeidel( const MatrixScalar* const D, const LocalOrdinal* const rowInd, const LocalOrdinal numRowInds, // length of rowInd const MatrixScalar omega, const char direction[]) { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; typedef LocalOrdinal LO; const OffsetType theNumRows = static_cast(numRows); const OffsetType theNumCols = static_cast(numCols); @@ -323,7 +323,7 @@ void reorderedGaussSeidel( for (LO ii = 0; ii < numRowInds; ++ii) { LO i = rowInd[ii]; for (OffsetType c = 0; c < theNumCols; ++c) { - x_temp[c] = Kokkos::Details::ArithTraits::zero(); + x_temp[c] = Kokkos::ArithTraits::zero(); } for (OffsetType k = ptr[i]; k < ptr[i + 1]; ++k) { const LO j = ind[k]; @@ -344,7 +344,7 @@ void reorderedGaussSeidel( for (LO ii = numRowInds - 1; ii != 0; --ii) { LO i = rowInd[ii]; for (OffsetType c = 0; c < theNumCols; ++c) { - x_temp[c] = Kokkos::Details::ArithTraits::zero(); + x_temp[c] = Kokkos::ArithTraits::zero(); } for (OffsetType k = ptr[i]; k < ptr[i + 1]; ++k) { const LO j = ind[k]; @@ -362,7 +362,7 @@ void reorderedGaussSeidel( const LO ii = 0; LO i = rowInd[ii]; for (OffsetType c = 0; c < theNumCols; ++c) { - x_temp[c] = Kokkos::Details::ArithTraits::zero(); + x_temp[c] = Kokkos::ArithTraits::zero(); } for (OffsetType k = ptr[i]; k < ptr[i + 1]; ++k) { const LO j = ind[k]; diff --git a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp index 04fc372100..e81649f552 100644 --- 
a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -211,6 +211,5 @@ struct SPADD_NUMERIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp index 13f70abfd0..aaab68568a 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -156,6 +156,5 @@ struct SPADD_SYMBOLIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index ef03e0b786..dd1a7cd9b5 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1086,8 +1086,6 @@ struct KokkosSPGEMMhandle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = - this->handle->get_spgemm_handle()->get_max_result_nnz(); + this->handle->get_spgemm_handle()->get_max_result_nnz( + Kokkos::View(rowmapC, m + 1)); typedef KokkosKernels::Impl::UniformMemoryPool pool_memory_space; diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp index 5be268a4ef..d36457a893 100644 --- a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp @@ -291,6 +291,5 @@ struct SPGEMM_JACOBI; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp b/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp index 352e3384ac..5ade88cb83 100644 --- a/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp @@ -158,6 +158,5 @@ struct SPGEMM_NOREUSE; #include -#include #endif // KOKKOSSPARSE_IMPL_SPGEMM_NOREUSE_SPEC_HPP_ diff --git a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index 21faea977c..b325f98796 100644 --- 
a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -251,6 +251,5 @@ struct SPGEMM_NUMERIC< false, true>; #include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp index 671017133a..3a74fb231e 100644 --- a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp @@ -204,6 +204,5 @@ struct SPGEMM_SYMBOLIC; #include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 9436b67029..c2863885b2 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -489,6 +489,174 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } // end iluk_numeric +template +void iluk_numeric_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &A_row_map_v, + const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v) { + using size_type = typename IlukHandle::size_type; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector lvl_ptr_h_v(nstreams); + std::vector lvl_idx_v(nstreams); // device views + std::vector lvl_start_v(nstreams); + std::vector lvl_end_v(nstreams); + std::vector iw_v(nstreams); // device views + std::vector 
stream_have_level_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; + for (int i = 0; i < nstreams; i++) { + nlevels_v[i] = thandle_v[i]->get_num_levels(); + lvl_ptr_h_v[i] = thandle_v[i]->get_host_level_ptr(); + lvl_idx_v[i] = thandle_v[i]->get_level_idx(); + iw_v[i] = thandle_v[i]->get_iw(); + stream_have_level_v[i] = true; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Assume all streams use the same algorithm + if (thandle_v[0]->get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // Initial work across streams at each level + for (int i = 0; i < nstreams; i++) { + // Only do this if this stream has this level + if (lvl < nlevels_v[i]) { + lvl_start_v[i] = lvl_ptr_h_v[i](lvl); + lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); + if ((lvl_end_v[i] - lvl_start_v[i]) != 0) + stream_have_level_v[i] = true; + else + stream_have_level_v[i] = false; + } else + stream_have_level_v[i] = false; + } + + // Main work of the level across streams + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Launch only if stream i-th has this level + if (stream_have_level_v[i]) { + ILUKLvlSchedRPNumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, + LValuesType, URowMapType, UEntriesType, UValuesType, + HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], lvl_idx_v[i], + iw_v[i], lvl_start_v[i]); + Kokkos::parallel_for( + "parfor_rp", + Kokkos::RangePolicy(execspace_v[i], + lvl_start_v[i], lvl_end_v[i]), + tstf); + } // end if (stream_have_level_v[i]) + } // end for streams + } // end for lvl + } // end SEQLVLSCHD_RP + else if (thandle_v[0]->get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + using policy_type = Kokkos::TeamPolicy; + + std::vector lvl_nchunks_h_v(nstreams); + std::vector lvl_nrowsperchunk_h_v(nstreams); + std::vector lvl_rowid_start_v(nstreams); + std::vector team_size_v(nstreams); + + for (int i = 0; i < nstreams; i++) { + lvl_nchunks_h_v[i] = thandle_v[i]->get_level_nchunks(); + lvl_nrowsperchunk_h_v[i] = thandle_v[i]->get_level_nrowsperchunk(); + team_size_v[i] = thandle_v[i]->get_team_size(); + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // Initial work across streams at each level + nnz_lno_t lvl_nchunks_max = 0; + for (int i = 0; i < nstreams; i++) { + // Only do this if this stream has this level + if (lvl < nlevels_v[i]) { + lvl_start_v[i] = lvl_ptr_h_v[i](lvl); + lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); + if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { + stream_have_level_v[i] = true; + lvl_rowid_start_v[i] = 0; + if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) + lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); + } else + stream_have_level_v[i] = false; + } else + stream_have_level_v[i] = false; + } + + // Main 
work of the level across streams -- looping through chunks + for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { + // 1. Launch work on all streams (for each chunk) + for (int i = 0; i < nstreams; i++) { + // Launch only if stream i-th has this level + if (stream_have_level_v[i]) { + // Launch only if stream i-th has this chunk + if (chunkid < lvl_nchunks_h_v[i](lvl)) { + // 1.a. Specify number of rows (i.e. number of teams) to launch + nnz_lno_t lvl_nrows_chunk = 0; + if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > + (lvl_end_v[i] - lvl_start_v[i])) + lvl_nrows_chunk = + (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; + else + lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); + + // 1.b. Create functor for stream i-th and launch + ILUKLvlSchedTP1NumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, + LEntriesType, LValuesType, URowMapType, UEntriesType, + UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + lvl_idx_v[i], iw_v[i], + lvl_start_v[i] + lvl_rowid_start_v[i]); + if (team_size_v[i] == -1) + Kokkos::parallel_for( + "parfor_tp1", + policy_type(execspace_v[i], lvl_nrows_chunk, Kokkos::AUTO), + tstf); + else + Kokkos::parallel_for( + "parfor_tp1", + policy_type(execspace_v[i], lvl_nrows_chunk, + team_size_v[i]), + tstf); + + // 1.c. 
Ready to move to next chunk + lvl_rowid_start_v[i] += lvl_nrows_chunk; + } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) + } // end if (stream_have_level_v[i]) + } // end for streams + } // end for chunkid + } // end for lvl + } // end SEQLVLSCHD_TP1 + +} // end iluk_numeric_streams + } // namespace Experimental } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp index ec711a3b17..12f8c43caf 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp @@ -31,10 +31,10 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spiluk_numeric_eti_spec_avail { enum : bool { value = false }; }; @@ -47,6 +47,7 @@ struct spiluk_numeric_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spiluk_numeric_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -99,18 +100,18 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosSparse::spiluk_numeric -template ::value, + ExecutionSpace, KernelHandle, ARowMapType, AEntriesType, + AValuesType, LRowMapType, LEntriesType, LValuesType, URowMapType, + UEntriesType, UValuesType>::value, bool eti_spec_avail = spiluk_numeric_eti_spec_avail< - KernelHandle, ARowMapType, AEntriesType, AValuesType, LRowMapType, - LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType>::value> + ExecutionSpace, KernelHandle, ARowMapType, AEntriesType, + AValuesType, LRowMapType, LEntriesType, LValuesType, URowMapType, + UEntriesType, UValuesType>::value> struct SPILUK_NUMERIC { static void spiluk_numeric( KernelHandle *handle, @@ -119,18 +120,30 @@ struct SPILUK_NUMERIC { const AValuesType &A_values, LRowMapType &L_row_map, 
LEntriesType &L_entries, LValuesType &L_values, URowMapType &U_row_map, UEntriesType &U_entries, UValuesType &U_values); + static void spiluk_numeric_streams( + const std::vector &execspace_v, + std::vector &handle_v, + const std::vector &A_row_map_v, + const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of spiluk_numeric // Unification layer -template -struct SPILUK_NUMERIC +struct SPILUK_NUMERIC { static void spiluk_numeric( KernelHandle *handle, @@ -146,6 +159,30 @@ struct SPILUK_NUMERIC &execspace_v, + std::vector &handle_v, + const std::vector &A_row_map_v, + const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v) { + std::vector spiluk_handle_v( + execspace_v.size()); + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + spiluk_handle_v[i] = handle_v[i].get_spiluk_handle(); + } + + Experimental::iluk_numeric_streams(execspace_v, spiluk_handle_v, + A_row_map_v, A_entries_v, A_values_v, + L_row_map_v, L_entries_v, L_values_v, + U_row_map_v, U_entries_v, U_values_v); + } }; #endif @@ -163,6 +200,7 @@ struct SPILUK_NUMERIC, \ @@ -208,6 +246,7 @@ struct SPILUK_NUMERIC, \ @@ -250,6 +289,5 @@ struct SPILUK_NUMERIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 616e87f154..9521420bfb 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -97,7 +97,7 @@ template (free_byte, total_byte); - 
avail_byte = static_cast(0.85 * free_byte); + avail_byte = static_cast(0.85 * static_cast(free_byte) / + static_cast(nstreams)); } #endif @@ -174,7 +175,8 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, } else #endif { - lnchunks(i) = 1; + // Workaround to fix unused-parameter nstreams error + lnchunks(i) = static_cast(nstreams / nstreams); lnrowsperchunk(i) = lnrows; } if (maxrowsperchunk < static_cast(lnrowsperchunk(i))) { @@ -225,7 +227,7 @@ void iluk_symbolic(IlukHandle& thandle, const ARowMapType& A_row_map_d, const AEntriesType& A_entries_d, LRowMapType& L_row_map_d, LEntriesType& L_entries_d, URowMapType& U_row_map_d, - UEntriesType& U_entries_d) { + UEntriesType& U_entries_d, int nstreams = 1) { if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP || thandle.get_algorithm() == @@ -433,7 +435,7 @@ void iluk_symbolic(IlukHandle& thandle, if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, nlev); + level_idx, nlev, nstreams); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp index 86f018886c..9d8f410918 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp @@ -100,7 +100,7 @@ struct SPILUK_SYMBOLIC { const typename KernelHandle::const_nnz_lno_t &fill_lev, const ARowMapType &A_row_map, const AEntriesType &A_entries, LRowMapType &L_row_map, LEntriesType &L_entries, URowMapType &U_row_map, - UEntriesType &U_entries); + UEntriesType &U_entries, int nstreams = 1); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -117,11 +117,12 @@ struct SPILUK_SYMBOLICget_spiluk_handle(); 
Experimental::iluk_symbolic(*spiluk_handle, fill_lev, A_row_map, A_entries, - L_row_map, L_entries, U_row_map, U_entries); + L_row_map, L_entries, U_row_map, U_entries, + nstreams); spiluk_handle->set_symbolic_complete(); } }; @@ -203,6 +204,5 @@ struct SPILUK_SYMBOLIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 1cd4241eae..abf44589f7 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -528,7 +528,7 @@ struct BSR_GEMV_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; //! Nonconst version of the type of column indices in the sparse matrix. typedef typename AMatrix::non_const_ordinal_type ordinal_type; @@ -816,7 +816,7 @@ struct BSR_GEMV_Transpose_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; //! Nonconst version of the type of column indices in the sparse matrix. typedef typename AMatrix::non_const_ordinal_type ordinal_type; @@ -1143,7 +1143,7 @@ struct BSR_GEMM_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; //! Nonconst version of the type of column indices in the sparse matrix. 
typedef typename AMatrix::non_const_ordinal_type ordinal_type; @@ -1449,7 +1449,7 @@ struct BSR_GEMM_Transpose_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; //! Nonconst version of the type of column indices in the sparse matrix. typedef typename AMatrix::non_const_ordinal_type ordinal_type; diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 13b88b3271..678aaaa0c5 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -150,14 +150,13 @@ struct SPMV_BSRMATRIX::is_complex) - method = Method::Fallback; - if (Kokkos::Details::ArithTraits::is_complex) - method = Method::Fallback; - if (Kokkos::Details::ArithTraits::is_complex) - method = Method::Fallback; + if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; + if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; + if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; // can't use tensor cores outside GPU if (!KokkosKernels::Impl::kk_is_gpu_exec_space< typename AMatrix::execution_space>()) @@ -295,14 +291,13 @@ struct SPMV_MV_BSRMATRIX; #include -#include -#include #endif // KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_SPEC_HPP_ diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index d3e495c426..6a82977e02 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -58,7 +58,7 @@ struct SPMV_Transpose_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; typedef typename 
YVector::non_const_value_type coefficient_type; typedef typename YVector::non_const_value_type y_value_type; @@ -118,7 +118,7 @@ struct SPMV_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const value_type alpha; AMatrix m_A; @@ -512,24 +512,10 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, #if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) || \ defined(KOKKOS_ENABLE_THREADS) { - int impl_thread_pool_size(0); -#if defined(KOKKOS_ENABLE_SERIAL) - if (std::is_same::value) - impl_thread_pool_size = 1; -#endif -#if defined(KOKKOS_ENABLE_OPENMP) - if (std::is_same::value) - impl_thread_pool_size = Kokkos::OpenMP::impl_thread_pool_size(); -#endif -#if defined(KOKKOS_ENABLE_THREADS) - if (std::is_same::value) - impl_thread_pool_size = Kokkos::Threads::impl_thread_pool_size(); -#endif - - if (impl_thread_pool_size == 1) { + if (execution_space().concurrency() == 1) { /// serial impl typedef typename AMatrix::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const size_type* KOKKOS_RESTRICT row_map_ptr = A.graph.row_map.data(); const ordinal_type* KOKKOS_RESTRICT col_idx_ptr = A.graph.entries.data(); const value_type* KOKKOS_RESTRICT values_ptr = A.values.data(); @@ -715,8 +701,7 @@ struct SPMV_MV_Transpose_Functor { for (ordinal_type iEntry = 0; iEntry < row_length; iEntry++) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) + conjugate ? Kokkos::ArithTraits::conj(row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); @@ -758,10 +743,9 @@ struct SPMV_MV_Transpose_Functor { Kokkos::ThreadVectorRange(dev, row_length), [&](ordinal_type iEntry) { const A_value_type val = - conjugate - ? 
Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate ? Kokkos::ArithTraits::conj( + row.value(iEntry)) + : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); if (doalpha != 1) { @@ -835,7 +819,7 @@ struct SPMV_MV_LayoutLeft_Functor { #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero(); + sum[k] = Kokkos::ArithTraits::zero(); } const auto row = m_A.rowConst(iRow); @@ -848,9 +832,9 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::parallel_for( Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate + ? Kokkos::ArithTraits::conj(row.value(iEntry)) + : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll @@ -925,7 +909,7 @@ struct SPMV_MV_LayoutLeft_Functor { #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero(); + sum[k] = Kokkos::ArithTraits::zero(); } const auto row = m_A.rowConst(iRow); @@ -937,8 +921,7 @@ struct SPMV_MV_LayoutLeft_Functor { for (ordinal_type iEntry = 0; iEntry < row.length; iEntry++) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) + conjugate ? Kokkos::ArithTraits::conj(row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -982,9 +965,9 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate + ? 
Kokkos::ArithTraits::conj(row.value(iEntry)) + : row.value(iEntry); lsum += val * m_x(row.colidx(iEntry), 0); }, sum); @@ -1018,8 +1001,7 @@ struct SPMV_MV_LayoutLeft_Functor { y_value_type sum = y_value_type(); for (ordinal_type iEntry = 0; iEntry < row.length; iEntry++) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) + conjugate ? Kokkos::ArithTraits::conj(row.value(iEntry)) : row.value(iEntry); sum += val * m_x(row.colidx(iEntry), 0); } @@ -1502,7 +1484,7 @@ void spmv_alpha_mv(const char mode[], const typename YVector::non_const_value_type& beta, const YVector& y) { typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; if (beta == KAT::zero()) { spmv_alpha_beta_mv(mode, alpha, A, x, diff --git a/sparse/impl/KokkosSparse_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_spec.hpp index d196265b23..95cd022159 100644 --- a/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -200,7 +200,7 @@ struct SPMV KAT; + typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { if (beta != KAT::one()) { @@ -240,7 +240,7 @@ struct SPMV_MV KAT; + typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { spmv_alpha_mv(mode, alpha, A, x, beta, y); @@ -353,9 +353,7 @@ struct SPMV_MV; #include -#include #include -#include #endif // KOKKOSSPARSE_IMPL_SPMV_SPEC_HPP_ diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 2831cb8861..8f217e05aa 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -37,7 +37,7 @@ struct SPMV_Struct_Transpose_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; typedef 
typename YVector::non_const_value_type coefficient_type; typedef typename YVector::non_const_value_type y_value_type; @@ -102,7 +102,7 @@ struct SPMV_Struct_Functor { typedef typename KokkosSparse::SparseRowViewConst row_view_const; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; typedef Kokkos::View > shared_ordinal_1d; @@ -521,7 +521,7 @@ struct SPMV_Struct_Functor { const size_type rowOffset = m_A.graph.row_map(rowIdx); y_value_type sum(0.0); -#if defined(KOKKOS_IF_ON_HOST) + // clang-format off KOKKOS_IF_ON_HOST(( for (ordinal_type idx = 0; idx < 27; ++idx) { @@ -540,25 +540,7 @@ struct SPMV_Struct_Functor { }, sum); )) - // clang-format on -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) // FIXME remove when - // requiring minimum - // version of - // Kokkos 3.6 - for (ordinal_type idx = 0; idx < 27; ++idx) { - sum += - m_A.values(rowOffset + idx) * m_x(rowIdx + columnOffsets(idx)); - } -#else - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(dev, 27), - [&](const ordinal_type& idx, y_value_type& lclSum) { - lclSum += (conjugate ? ATV::conj(m_A.values(rowOffset + idx)) - : m_A.values(rowOffset + idx)) * - m_x(rowIdx + columnOffsets(idx)); - }, - sum); -#endif + // clang-format on Kokkos::single(Kokkos::PerThread(dev), [&]() { m_y(rowIdx) = beta * m_y(rowIdx) + alpha * sum; @@ -997,10 +979,9 @@ struct SPMV_MV_Struct_Transpose_Functor { Kokkos::ThreadVectorRange(dev, row_length), [&](ordinal_type iEntry) { const A_value_type val = - conjugate - ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate ? 
Kokkos::ArithTraits::conj( + row.value(iEntry)) + : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); if (doalpha != 1) { @@ -1072,7 +1053,7 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero(); + sum[k] = Kokkos::ArithTraits::zero(); } const auto row = m_A.rowConst(iRow); @@ -1080,9 +1061,9 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { Kokkos::parallel_for( Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate + ? Kokkos::ArithTraits::conj(row.value(iEntry)) + : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -1157,9 +1138,9 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate + ? 
Kokkos::ArithTraits::conj(row.value(iEntry)) + : row.value(iEntry); lsum += val * m_x(row.colidx(iEntry), 0); }, sum); @@ -1483,7 +1464,7 @@ void spmv_alpha_mv_struct(const char mode[], const typename YVector::non_const_value_type& beta, const YVector& y) { typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; if (beta == KAT::zero()) { spmv_alpha_beta_mv_struct( diff --git a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp index fde9bf4dcf..9b22278db2 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp @@ -201,9 +201,9 @@ struct SPMV_STRUCT& structure, const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { if (beta != KAT::one()) { @@ -242,7 +242,7 @@ struct SPMV_MV_STRUCT KAT; + typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { spmv_alpha_mv_struct(mode, alpha, A, x, @@ -357,7 +357,5 @@ struct SPMV_MV_STRUCT; #include -#include -#include #endif // KOKKOSSPARSE_IMPL_SPMV_STRUCT_SPEC_HPP_ diff --git a/sparse/impl/KokkosSparse_spmv_team_impl.hpp b/sparse/impl/KokkosSparse_spmv_team_impl.hpp new file mode 100644 index 0000000000..622dd4997c --- /dev/null +++ b/sparse/impl/KokkosSparse_spmv_team_impl.hpp @@ -0,0 +1,134 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_TEAM_IMPL_HPP_ +#define KOKKOSSPARSE_SPMV_TEAM_IMPL_HPP_ + +#include +#include +#include +#include + +namespace KokkosSparse { +namespace Impl { + +struct TeamSpmvInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const OrdinalType numRows, + const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT x, + const OrdinalType xs0, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT y, const OrdinalType ys0); +}; + +struct TeamVectorSpmvInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const OrdinalType numRows, + const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT x, + const OrdinalType xs0, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT y, const OrdinalType ys0); +}; + +template +KOKKOS_INLINE_FUNCTION int TeamSpmvInternal::invoke( + const MemberType& member, const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT x, + const OrdinalType xs0, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT y, const OrdinalType ys0) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numRows), + 
[&](const OrdinalType& iRow) { + const OrdinalType rowLength = + row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess0] * + x[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs0]; + } + + sum *= alpha; + + if (dobeta == 0) { + y[iRow * ys0] = sum; + } else { + y[iRow * ys0] = beta * y[iRow * ys0] + sum; + } + }); + return 0; +} + +template +KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( + const MemberType& member, const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT x, + const OrdinalType xs0, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT y, const OrdinalType ys0) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numRows), + [&](const OrdinalType& iRow) { + const OrdinalType rowLength = + row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + + ValueType sum = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, rowLength), + [&](const OrdinalType& iEntry, ValueType& val) { + val += values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess0] * + x[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs0]; + }, + sum); + + sum *= alpha; + + if (dobeta == 0) { + y[iRow * ys0] = sum; + } else { + y[iRow * ys0] = beta * y[iRow * ys0] + sum; + } + }); + return 0; +} + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/sparse/impl/KokkosSparse_spmv_team_spec.hpp b/sparse/impl/KokkosSparse_spmv_team_spec.hpp new file mode 100644 index 0000000000..156123b113 --- /dev/null +++ 
b/sparse/impl/KokkosSparse_spmv_team_spec.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_TEAM_SPEC_HPP_ +#define KOKKOSSPARSE_SPMV_TEAM_SPEC_HPP_ + +#include +#include +#include +#include +#include + +namespace KokkosSparse { + +template +struct TeamSpmv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, + const ValuesViewType& values, const IntView& row_ptr, + const IntView& colIndices, const xViewType& x, const ScalarType beta, + const yViewType& y) { + return Impl::TeamSpmvInternal::invoke< + MemberType, ScalarType, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, dobeta>( + member, x.extent(0), alpha, values.data(), values.stride_0(), + row_ptr.data(), row_ptr.stride_0(), colIndices.data(), + colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); + } +}; + +template +struct TeamVectorSpmv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, + const ValuesViewType& values, const IntView& row_ptr, + const IntView& colIndices, const xViewType& x, const ScalarType beta, + const yViewType& y) { + return Impl::TeamVectorSpmvInternal::invoke< + MemberType, ScalarType, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, dobeta>( + member, x.extent(0), alpha, values.data(), values.stride_0(), + row_ptr.data(), 
row_ptr.stride_0(), colIndices.data(), + colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); + } +}; + +} // namespace KokkosSparse + +#endif diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 17611c3f2c..7605f03fa2 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -25,7 +25,7 @@ namespace Impl { template -void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, +void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -58,28 +58,28 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); int64_t nnz = static_cast(entries.extent(0)); size_t pBufferSize; - void* rm; + void *rm; // NOTE (Oct-29-2022): // cusparseCreateCsr only supports the same sizes (either 32 bits or 64 // bits) for row_map_type and entries_type if (std::is_same::value) { if (!std::is_same::value) { sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); - rm = (void*)sptrsv_handle->get_int_rowmap_ptr_copy(row_map); + rm = (void *)sptrsv_handle->get_int_rowmap_ptr_copy(row_map); } else { - rm = (void*)row_map.data(); + rm = (void *)row_map.data(); } } else { // idx_type has 64 bits if (!std::is_same::value) { sptrsv_handle->allocate_tmp_int64_rowmap(row_map.extent(0)); - rm = (void*)sptrsv_handle->get_int64_rowmap_ptr_copy(row_map); + rm = (void *)sptrsv_handle->get_int64_rowmap_ptr_copy(row_map); } else { - rm = (void*)row_map.data(); + rm = (void *)row_map.data(); } } const scalar_type alpha = scalar_type(1.0); @@ -93,8 +93,8 @@ void 
sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, // Create sparse matrix in CSR format KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( &(h->matDescr), static_cast(nrows), - static_cast(nrows), nnz, rm, (void*)entries.data(), - (void*)values.data(), cudaCsrRowMapType, cudaCsrColIndType, + static_cast(nrows), nnz, rm, (void *)entries.data(), + (void *)values.data(), cudaCsrRowMapType, cudaCsrColIndType, CUSPARSE_INDEX_BASE_ZERO, cudaValueType)); // Create dummy dense vector B (RHS) @@ -132,7 +132,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, h->spsvDescr, &pBufferSize)); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void**)&(h->pBuffer), pBufferSize)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&(h->pBuffer), pBufferSize)); // Run analysis KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_analysis( @@ -284,7 +284,7 @@ template < typename KernelHandle, typename ain_row_index_view_type, typename ain_nonzero_index_view_type, typename ain_values_scalar_view_type, typename b_values_scalar_view_type, typename x_values_scalar_view_type> -void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, +void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -320,7 +320,7 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, "CUSPARSE requires local ordinals to be integer (32 bits or 64 " "bits).\n"); } else { - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); const scalar_type alpha = scalar_type(1.0); @@ -330,12 +330,12 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, // Create dense vector B (RHS) KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecBDescr), static_cast(nrows), - (void*)rhs.data(), cudaValueType)); + (void *)rhs.data(), cudaValueType)); // Create dense vector X (LHS) 
KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecXDescr), static_cast(nrows), - (void*)lhs.data(), cudaValueType)); + (void *)lhs.data(), cudaValueType)); // Solve KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve( @@ -436,6 +436,198 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, #endif } +// -------------------------------- +// Stream interface +// -------------------------------- + +template +void sptrsvcuSPARSE_solve_streams( + const std::vector &execspace_v, + std::vector &handle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v, bool /*trans*/ +) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + using idx_type = typename KernelHandle::nnz_lno_t; + using size_type = typename KernelHandle::size_type; + using scalar_type = typename KernelHandle::nnz_scalar_t; + using memory_space = typename KernelHandle::HandlePersistentMemorySpace; + using sptrsvHandleType = typename KernelHandle::SPTRSVHandleType; + using sptrsvCuSparseHandleType = + typename sptrsvHandleType::SPTRSVcuSparseHandleType; + + int nstreams = execspace_v.size(); +#if (CUDA_VERSION >= 11030) + (void)row_map_v; + (void)entries_v; + (void)values_v; + + const bool is_cuda_space = + std::is_same::value || + std::is_same::value || + std::is_same::value; + + const bool is_idx_type_supported = std::is_same::value || + std::is_same::value; + + if constexpr (!is_cuda_space) { + throw std::runtime_error( + "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED " + "IN GPU DEVICE for CUSPARSE\n"); + } else if constexpr (!is_idx_type_supported) { + throw std::runtime_error( + "CUSPARSE requires local ordinals to be integer (32 bits or 64 " + "bits).\n"); + } else { + const scalar_type alpha = scalar_type(1.0); + + cudaDataType cudaValueType = cuda_data_type_from(); + + std::vector h_v(nstreams); + + for (int i = 0; i < nstreams; i++) { + sptrsvHandleType *sptrsv_handle = 
handle_v[i].get_sptrsv_handle(); + h_v[i] = sptrsv_handle->get_cuSparseHandle(); + + // Bind cuspare handle to a stream + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h_v[i]->handle, execspace_v[i].cuda_stream())); + + int64_t nrows = static_cast(sptrsv_handle->get_nrows()); + + // Create dense vector B (RHS) + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( + &(h_v[i]->vecBDescr), nrows, (void *)rhs_v[i].data(), cudaValueType)); + + // Create dense vector X (LHS) + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( + &(h_v[i]->vecXDescr), nrows, (void *)lhs_v[i].data(), cudaValueType)); + } + + // Solve + for (int i = 0; i < nstreams; i++) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve( + h_v[i]->handle, h_v[i]->transpose, &alpha, h_v[i]->matDescr, + h_v[i]->vecBDescr, h_v[i]->vecXDescr, cudaValueType, + CUSPARSE_SPSV_ALG_DEFAULT, h_v[i]->spsvDescr)); + } + + // Destroy dense vector descriptors + for (int i = 0; i < nstreams; i++) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h_v[i]->vecBDescr)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h_v[i]->vecXDescr)); + } + } +#else // CUDA_VERSION < 11030 + const bool is_cuda_space = + std::is_same::value || + std::is_same::value || + std::is_same::value; + + if constexpr (!is_cuda_space) { + throw std::runtime_error( + "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED " + "IN GPU DEVICE for CUSPARSE\n"); + } else if constexpr (!std::is_same::value) { + throw std::runtime_error( + "CUSPARSE requires local ordinals to be integer.\n"); + } else { + const scalar_type alpha = scalar_type(1.0); + std::vector sptrsv_handle_v(nstreams); + std::vector h_v(nstreams); + std::vector rm_v(nstreams); + std::vector ent_v(nstreams); + std::vector vals_v(nstreams); + std::vector bv_v(nstreams); + std::vector xv_v(nstreams); + + for (int i = 0; i < nstreams; i++) { + sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); + h_v[i] = sptrsv_handle_v[i]->get_cuSparseHandle(); + + // Bind cuspare handle to a stream 
+ KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h_v[i]->handle, execspace_v[i].cuda_stream())); + + if (h_v[i]->pBuffer == nullptr) { + std::cout << " pBuffer invalid on stream " << i << std::endl; + } + rm_v[i] = !std::is_same::value + ? sptrsv_handle_v[i]->get_int_rowmap_ptr() + : reinterpret_cast(row_map_v[i].data()); + ent_v[i] = reinterpret_cast(entries_v[i].data()); + vals_v[i] = values_v[i].data(); + bv_v[i] = rhs_v[i].data(); + xv_v[i] = lhs_v[i].data(); + } + + for (int i = 0; i < nstreams; i++) { + int nnz = entries_v[i].extent_int(0); + int nrows = static_cast(sptrsv_handle_v[i]->get_nrows()); + if (std::is_same::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve( + h_v[i]->handle, h_v[i]->transpose, nrows, nnz, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, + h_v[i]->pBuffer)); + } else if (std::is_same::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve( + h_v[i]->handle, h_v[i]->transpose, nrows, nnz, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, + h_v[i]->pBuffer)); + } else if (std::is_same >::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve( + h_v[i]->handle, h_v[i]->transpose, nrows, nnz, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, + h_v[i]->pBuffer)); + } else if (std::is_same >::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve( + h_v[i]->handle, h_v[i]->transpose, nrows, nnz, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + 
reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, + h_v[i]->pBuffer)); + } else { + throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); + } + } + } +#endif +#else + (void)execspace_v; + (void)handle_v; + (void)row_map_v; + (void)entries_v; + (void)values_v; + (void)rhs_v; + (void)lhs_v; + throw std::runtime_error("CUSPARSE IS NOT DEFINED\n"); +#endif +} + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 4cff646325..e2a625e2a7 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -103,15 +103,13 @@ struct TriLvlSchedTP1SolverFunctor { long node_count; // like "block" offset into ngbl, my_league is the "local" // offset - long dense_nrows; TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, - const bool is_lowertri_, long node_count_, - long dense_nrows_ = 0) + const bool &is_lowertri_, const long &node_count_) : row_map(row_map_), entries(entries_), values(values_), @@ -119,8 +117,7 @@ struct TriLvlSchedTP1SolverFunctor { rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), is_lowertri(is_lowertri_), - node_count(node_count_), - dense_nrows(dense_nrows_) {} + node_count(node_count_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -398,7 +395,7 @@ struct LowerTriLvlSchedRPSolverFunctor { const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - NGBLType nodes_grouped_by_level_) + const NGBLType &nodes_grouped_by_level_) : row_map(row_map_), entries(entries_), values(values_), @@ -412,11 +409,11 @@ struct LowerTriLvlSchedRPSolverFunctor { // Assuming indices are sorted per row, diag entry is final index 
in the // list - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); auto rhs_rowid = rhs(rowid); - for (auto ptr = soffset; ptr < eoffset; ++ptr) { + for (long ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { @@ -430,12 +427,12 @@ struct LowerTriLvlSchedRPSolverFunctor { KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const lno_t i) const { auto rowid = nodes_grouped_by_level(i); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); auto rhs_rowid = rhs(rowid); auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { + for (long ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { @@ -2962,6 +2959,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, Kokkos::Timer sptrsv_timer; sptrsv_timer.reset(); #endif + for (size_type lvl = 0; lvl < nlevels; ++lvl) { { size_type lvl_nodes = hnodes_per_level(lvl); @@ -3252,6 +3250,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } // scope for if-block } // end for lvl + #ifdef profile_supernodal_etree Kokkos::fence(); double sptrsv_time_seconds = sptrsv_timer.seconds(); @@ -4019,6 +4018,182 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, } // end tri_solve_chain +// -------------------------------- +// Stream interfaces +// -------------------------------- + +template +void lower_tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + using size_type = typename TriSolveHandle::size_type; + using 
NGBLType = typename TriSolveHandle::nnz_lno_view_t; + using nodes_per_level_type = + typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; + for (int i = 0; i < nstreams; i++) { + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + Kokkos::RangePolicy( + execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + LowerTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + using policy_type = Kokkos::TeamPolicy; + int team_size = thandle_v[i]->get_team_size(); +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], true, + node_count_v[i]); +#else + LowerTriLvlSchedTP1SolverFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); +#endif + if (team_size == -1) + Kokkos::parallel_for( + "parfor_l_team", + policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for( + "parfor_l_team", + policy_type(execspace_v[i], lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + } // end for lvl +} // end lower_tri_solve_streams + +template +void upper_tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + using size_type = typename 
TriSolveHandle::size_type; + using NGBLType = typename TriSolveHandle::nnz_lno_view_t; + using nodes_per_level_type = + typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; + for (int i = 0; i < nstreams; i++) { + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + Kokkos::RangePolicy( + execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + UpperTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + using policy_type = Kokkos::TeamPolicy; + int team_size = thandle_v[i]->get_team_size(); +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], false, + node_count_v[i]); +#else + UpperTriLvlSchedTP1SolverFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); +#endif + if (team_size == -1) + Kokkos::parallel_for( + "parfor_l_team", + policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for( + "parfor_l_team", + policy_type(execspace_v[i], lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + } // end for lvl +} // end upper_tri_solve_streams + } // namespace Experimental } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index fce10e3acd..e36b9df236 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -31,8 +31,8 @@ namespace 
KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct sptrsv_solve_eti_spec_avail { enum : bool { value = false }; }; @@ -45,6 +45,7 @@ struct sptrsv_solve_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct sptrsv_solve_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -83,29 +84,39 @@ namespace Impl { #endif // Unification layer -/// \brief Implementation of KokkosSparse::sptrsv_solve - -template ::value, - bool eti_spec_avail = - sptrsv_solve_eti_spec_avail::value> +/// \brief Implementations of KokkosSparse::sptrsv_solve and +/// \brief KokkosSparse::sptrsv_solve_streams + +template ::value, + bool eti_spec_avail = sptrsv_solve_eti_spec_avail< + ExecutionSpace, KernelHandle, RowMapType, EntriesType, ValuesType, + BType, XType>::value> struct SPTRSV_SOLVE { static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, const EntriesType entries, const ValuesType values, BType b, XType x); + + static void sptrsv_solve_streams( + const std::vector &execspace_v, + std::vector &handle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, const std::vector &b_v, + std::vector &x_v); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -//! Full specialization of sptrsv_solve +//! 
Full specialization of sptrsv_solve and sptrsv_solve_streams // Unification layer -template -struct SPTRSV_SOLVE { +template +struct SPTRSV_SOLVE { static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, const EntriesType entries, const ValuesType values, BType b, XType x) { @@ -155,6 +166,48 @@ struct SPTRSV_SOLVE &execspace_v, + std::vector &handle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, const std::vector &b_v, + std::vector &x_v) { + // Call specific algorithm type + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + // Assume streams have the same either lower or upper matrix type + std::vector sptrsv_handle_v( + execspace_v.size()); + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); + } + Kokkos::Profiling::pushRegion(sptrsv_handle_v[0]->is_lower_tri() + ? "KokkosSparse_sptrsv[lower]" + : "KokkosSparse_sptrsv[upper]"); + if (sptrsv_handle_v[0]->is_lower_tri()) { + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { + Experimental::lower_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + entries_v[i]); + } + } + Experimental::lower_tri_solve_streams(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, b_v, + x_v); + } else { + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { + Experimental::upper_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + entries_v[i]); + } + } + Experimental::upper_tri_solve_streams(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, b_v, + x_v); + } + Kokkos::Profiling::popRegion(); + } }; #endif @@ -172,6 +225,7 @@ struct SPTRSV_SOLVE, \ @@ -200,6 +254,7 @@ struct SPTRSV_SOLVE, \ @@ -225,6 +280,5 @@ struct SPTRSV_SOLVE; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp 
b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index eed6d3129a..3ef3be8780 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -16,7 +16,7 @@ #ifndef KOKKOSSPARSE_IMPL_SPTRSV_SYMBOLIC_HPP_ #define KOKKOSSPARSE_IMPL_SPTRSV_SYMBOLIC_HPP_ -/// \file Kokkos_Sparse_impl_sptrsv_symbolic.hpp +/// \file KokkosSparse_impl_sptrsv_symbolic.hpp /// \brief Implementation(s) of sparse triangular solve. #include diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index f13c25dec6..73389d10d0 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -144,6 +144,5 @@ struct SPTRSV_SYMBOLIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_trsv_impl.hpp b/sparse/impl/KokkosSparse_trsv_impl.hpp index 2e2a706745..fbbd547e34 100644 --- a/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOSSPARSE_IMPL_TRSM_HPP_ #define KOKKOSSPARSE_IMPL_TRSM_HPP_ -/// \file Kokkos_Sparse_impl_trsm.hpp +/// \file KokkosSparse_impl_trsm.hpp /// \brief Implementation(s) of sparse triangular solve. 
#include @@ -72,7 +72,7 @@ void lowerTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); // const local_ordinal_type numCols = A.numCols (); @@ -190,7 +190,7 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, typename CrsMatrixType::row_map_type ptr = A.graph.row_map; typename CrsMatrixType::index_type ind = A.graph.entries; typename CrsMatrixType::values_type val = A.values; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; // If local_ordinal_type is unsigned and numRows is 0, the loop // below will have entirely the wrong number of iterations. @@ -425,7 +425,7 @@ void upperTriSolveCscUnitDiagConj(RangeMultiVectorType X, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); const local_ordinal_type numCols = A.numCols(); @@ -486,7 +486,7 @@ void upperTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); const local_ordinal_type numCols = A.numCols(); @@ -600,7 +600,7 @@ void lowerTriSolveCscUnitDiagConj(RangeMultiVectorType X, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); const local_ordinal_type numCols = A.numCols(); @@ -638,7 +638,7 @@ void 
lowerTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); const local_ordinal_type numCols = A.numCols(); diff --git a/sparse/impl/KokkosSparse_trsv_spec.hpp b/sparse/impl/KokkosSparse_trsv_spec.hpp index ff4a6d90cd..2e838337d2 100644 --- a/sparse/impl/KokkosSparse_trsv_spec.hpp +++ b/sparse/impl/KokkosSparse_trsv_spec.hpp @@ -189,6 +189,5 @@ struct TRSV; #include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index a64d7f76a0..00fdcd2442 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -82,7 +82,7 @@ class TwostageGaussSeidel { using internal_vector_view_t = typename TwoStageGaussSeidelHandleType::vector_view_t; - using ST = Kokkos::Details::ArithTraits; + using ST = Kokkos::ArithTraits; using mag_t = typename ST::mag_type; private: @@ -407,7 +407,7 @@ class TwostageGaussSeidel { // functor for storing both valuesL & valuesU (with parallel_for) KOKKOS_INLINE_FUNCTION void operator()(const Tag_valuesLU &, const ordinal_t i) const { - const_scalar_t one = Kokkos::Details::ArithTraits::one(); + const_scalar_t one = Kokkos::ArithTraits::one(); ordinal_t nnzL = row_map(i); ordinal_t nnzU = row_map2(i); ordinal_t nnzLa = 0; @@ -851,8 +851,8 @@ class TwostageGaussSeidel { bool init_zero_x_vector = false, int numIter = 1, scalar_t omega = ST::one(), bool apply_forward = true, bool apply_backward = true, bool /*update_y_vector*/ = true) { - const_scalar_t one = Kokkos::Details::ArithTraits::one(); - const_scalar_t zero = Kokkos::Details::ArithTraits::zero(); + const_scalar_t one = Kokkos::ArithTraits::one(); + 
const_scalar_t zero = Kokkos::ArithTraits::zero(); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS double tic; Kokkos::Timer timer; diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 598cf9577d..dae3f12462 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -418,15 +418,15 @@ class KokkosKernelsHandle { return this->my_exec_space; } - /** - * \brief Returns the suggested team work size. If set with - * set_team_work_size, it will return the set value. Otherwise it will return - * the teamsize. \param team_size: input, team size used by the kernel. \param - * concurrency: input, the number of threads overall. Not used currently. - * \param overall_work_size: The overall work size. - */ - int get_team_work_size(const int team_size, const int /* concurrency */, - const nnz_lno_t /* overall_work_size */) { + /// \brief Returns the suggested team work size. If set with + /// set_team_work_size, it will return the set value. Otherwise it will return + /// the teamsize. + /// \param team_size input, team size used by the kernel. + /// \param concurrency filler for concurrency + /// \param overall_work_size filler for overall_work_size + int get_team_work_size(const int team_size, + [[maybe_unused]] const int concurrency, + [[maybe_unused]] const nnz_lno_t overall_work_size) { if (this->team_work_size != -1) { return this->team_work_size; } else { @@ -453,11 +453,10 @@ class KokkosKernelsHandle { */ bool is_dynamic_scheduling() { return this->use_dynamic_scheduling; } - /** - * \brief sets the shared memory size to be used by the kernels using shared - * memory on GPUs. \param shared_memory_size: input, shared memory size to be - * used by the kernel. * - */ + /// \brief sets the shared memory size to be used by the kernels using shared + /// memory on GPUs. + /// \param shared_memory_size_ input, shared memory size to be used by the + /// kernel. 
void set_shmem_size(const size_t shared_memory_size_) { this->shared_memory_size = shared_memory_size_; } @@ -500,10 +499,9 @@ class KokkosKernelsHandle { int get_set_suggested_team_size() { return this->suggested_team_size; } - /** - * \brief Returns the team size, either set by the user or suggested by the - * handle. \param vector_size: suggested vector size by the handle. - */ + /// \brief Returns the team size, either set by the user or suggested by the + /// handle. + /// \param vector_size_ suggested vector size by the handle. int get_suggested_team_size(const int vector_size_) { if (this->suggested_team_size != -1) { return this->suggested_team_size; diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index f27143039c..a366245a86 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -14,7 +14,7 @@ // //@HEADER -/// \file Kokkos_Sparse_BsrMatrix.hpp +/// \file KokkosSparse_BsrMatrix.hpp /// \brief Local sparse matrix interface /// /// This file provides KokkosSparse::Experimental::BsrMatrix. @@ -156,13 +156,12 @@ struct BsrRowView { } /// \brief Return offset into colidx_ for the requested block idx - /// If none found, return Kokkos::Details::ArithTraits::max + /// If none found, return Kokkos::ArithTraits::max /// \param idx_to_match [in] local block idx within block-row - /// \param is_sorted [in] defaulted to false; no usage at this time KOKKOS_INLINE_FUNCTION ordinal_type findRelBlockOffset(const ordinal_type idx_to_match, bool /*is_sorted*/ = false) const { - ordinal_type offset = Kokkos::Details::ArithTraits::max(); + ordinal_type offset = Kokkos::ArithTraits::max(); for (ordinal_type blk_offset = 0; blk_offset < length; ++blk_offset) { ordinal_type idx = colidx_[blk_offset]; if (idx == idx_to_match) { @@ -213,6 +212,7 @@ struct BsrRowViewConst { /// /// \param values [in] Array of the row's values. /// \param colidx [in] Array of the row's column indices. 
+ /// \param blockDim [in] The block dimensions. /// \param count [in] Number of entries in the row. /// \param start [in] Offset into values and colidx of the desired block-row /// start. @@ -292,15 +292,14 @@ struct BsrRowViewConst { } /// \brief Return offset into colidx_ for the requested block idx - /// If none found, return Kokkos::Details::ArithTraits::max + /// If none found, return Kokkos::ArithTraits::max /// \param idx_to_match [in] local block idx within block-row - /// \param is_sorted [in] defaulted to false; no usage at this time KOKKOS_INLINE_FUNCTION ordinal_type findRelBlockOffset(const ordinal_type& idx_to_match, bool /*is_sorted*/ = false) const { typedef typename std::remove_cv::type non_const_ordinal_type; non_const_ordinal_type offset = - Kokkos::Details::ArithTraits::max(); + Kokkos::ArithTraits::max(); for (non_const_ordinal_type blk_offset = 0; blk_offset < length; ++blk_offset) { ordinal_type idx = colidx_[blk_offset]; @@ -333,6 +332,10 @@ class BsrMatrix { static_assert( std::is_signed::value, "BsrMatrix requires that OrdinalType is a signed integer type."); + static_assert(Kokkos::is_memory_traits_v || + std::is_void_v, + "BsrMatrix: MemoryTraits (4th template param) must be a Kokkos " + "MemoryTraits or void"); private: typedef @@ -390,6 +393,11 @@ class BsrMatrix { //! Nonconst version of the type of the entries in the sparse matrix. typedef typename values_type::non_const_value_type non_const_value_type; + // block values are actually a 1-D view, however they are implicitly + // arranged in LayoutRight, e.g. consecutive entries in the values view + // are consecutive entries within a row inside a block + using block_layout = Kokkos::LayoutRight; + /// \name Storage of the actual sparsity structure and values. /// /// BsrMatrix uses the compressed sparse row (CSR) storage format to @@ -452,12 +460,12 @@ class BsrMatrix { } } - /// \brief Constructor that copies raw arrays of host data in - /// coordinate format. 
+ /// \brief Construct BsrMatrix from host data in COO format. /// - /// On input, each entry of the sparse matrix is stored in val[k], - /// with row index rows[k] and column index cols[k]. We assume that - /// the entries are sorted in increasing order by row index. + /// The COO matrix must already have a block structure. + /// Each entry k of the input sparse matrix has a value stored in val[k], + /// row index in rows[k] and column index in cols[k]. + /// The COO data must be sorted by increasing row index /// /// This constructor is mainly useful for benchmarking or for /// reading the sparse matrix's data from a file. @@ -466,18 +474,19 @@ class BsrMatrix { /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. /// \param annz [in] The number of entries. - /// \param val [in] The entries. - /// \param rows [in] The row indices. rows[k] is the row index of + /// \param vals [in] The entries. + /// \param rows [in] The row indices. rows[k] is the row index of /// val[k]. - /// \param cols [in] The column indices. cols[k] is the column + /// \param cols [in] The column indices. cols[k] is the column /// index of val[k]. + /// \param blockdim [in] The block size of the constructed BsrMatrix. /// \param pad [in] If true, pad the sparse matrix's storage with /// zeros in order to improve cache alignment and / or /// vectorization. /// /// The \c pad argument is currently not used. 
BsrMatrix(const std::string& label, OrdinalType nrows, OrdinalType ncols, - size_type annz, ScalarType* val, OrdinalType* rows, + size_type annz, ScalarType* vals, OrdinalType* rows, OrdinalType* cols, OrdinalType blockdim, bool pad = false) { (void)label; (void)pad; @@ -489,120 +498,158 @@ class BsrMatrix { KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((ncols % blockDim_ != 0) || (nrows % blockDim_ != 0)) { - assert( - (ncols % blockDim_ == 0) && - "BsrMatrix: input CrsMatrix columns is not a multiple of block size"); - assert((nrows % blockDim_ == 0) && - "BsrMatrix: input CrsMatrix rows is not a multiple of block size"); + if (ncols % blockDim_) { + std::ostringstream os; + os << "BsrMatrix: " << ncols + << " input CrsMatrix columns is not a multiple of block size " + << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + if (nrows % blockDim_) { + std::ostringstream os; + os << "BsrMatrix: " << nrows + << " input CrsMatrix rows is not a multiple of block size " + << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + if (annz % (blockDim_ * blockDim_)) { + throw std::runtime_error( + "BsrMatrix:: annz should be a multiple of the number of entries in a " + "block"); } - numCols_ = ncols / blockDim_; - ordinal_type tmp_num_rows = nrows / blockDim_; - - // - // Wrap the raw pointers in unmanaged host Views - // Note that the inputs are in coordinate format. - // So unman_rows and unman_cols have the same type. 
- // - typename values_type::HostMirror unman_val(val, annz); - typename index_type::HostMirror unman_rows(rows, annz); - typename index_type::HostMirror unman_cols(cols, annz); - - typename row_map_type::non_const_type tmp_row_map( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "rowmap"), - tmp_num_rows + 1); - auto row_map_host = Kokkos::create_mirror_view(tmp_row_map); - Kokkos::deep_copy(row_map_host, 0); - - if (annz > 0) { - ordinal_type iblock = 0; - std::set set_blocks; - for (size_type ii = 0; ii <= annz; ++ii) { - if ((ii == annz) || ((unman_rows(ii) / blockDim_) > iblock)) { - // Flush the stored entries - row_map_host(iblock + 1) = set_blocks.size(); - if (ii == annz) break; - set_blocks.clear(); - iblock = unman_rows(ii) / blockDim_; - } - ordinal_type tmp_jblock = unman_cols(ii) / blockDim_; - set_blocks.insert(tmp_jblock); + using Coord = std::pair; // row, col + using CoordComp = std::function; // type that can order Coords + using Entry = std::pair; // (row, col), val + using Blocks = std::map, + CoordComp>; // map a block to its non-zeros, sorted + // by row, then col + + numCols_ = ncols / blockDim_; + ordinal_type numRows = nrows / blockDim_; + size_type numBlocks = annz / (blockDim_ * blockDim_); + + // device data + typename row_map_type::non_const_type row_map_device( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_device"), + numRows + 1); + index_type entries_device("entries_device", numBlocks); + Kokkos::resize(values, annz); + + // mirror views on host + auto row_map_host = Kokkos::create_mirror_view(row_map_device); + auto entries_host = Kokkos::create_mirror_view(entries_device); + auto values_host = Kokkos::create_mirror_view(values); + + auto coord_by_row_col = [](const Coord& a, const Coord& b) { + const auto& arow = std::get<0>(a); + const auto& brow = std::get<0>(b); + const auto& acol = std::get<1>(a); + const auto& bcol = std::get<1>(b); + if (arow < brow) { + return true; + } else if (arow > brow) { + return false; 
+ } else { + return acol < bcol; + } + }; + + auto entry_by_row_col = [coord_by_row_col](const Entry& a, const Entry& b) { + return coord_by_row_col(std::get<0>(a), std::get<0>(b)); + }; + + // organize all blocks and their entries + Blocks blocks(coord_by_row_col); + for (size_type i = 0; i < annz; ++i) { + const ordinal_type row = rows[i]; + const ordinal_type col = cols[i]; + const ScalarType val = vals[i]; + const Coord block = Coord(row / blockDim_, col / blockDim_); + const Entry entry(Coord(row, col), val); + + // add entry to the correct block + auto it = blocks.find(block); + if (it == blocks.end()) { + std::vector entries = {entry}; + entries.reserve(blockDim_ * blockDim_); + blocks[block] = std::move(entries); // new block with entry + } else { + it->second.push_back(entry); // add entry to block } } - for (size_type ii = 0; ii < annz; ++ii) - row_map_host(ii + 1) += row_map_host(ii); - - Kokkos::deep_copy(tmp_row_map, row_map_host); - - // Create temporary Views for row_map and entries - // because the StaticCrsGraph ctor requires View inputs - index_type tmp_entries("tmp_entries", row_map_host(tmp_num_rows)); - auto tmp_entries_host = Kokkos::create_mirror_view(tmp_entries); - - Kokkos::resize(values, row_map_host(tmp_num_rows) * blockDim_ * blockDim_); - auto values_host = Kokkos::create_mirror_view(values); - Kokkos::deep_copy(values_host, 0); - - if (annz > 0) { - //--- Fill tmp_entries - ordinal_type cur_block = 0; - std::set set_blocks; - for (size_type ii = 0; ii <= annz; ++ii) { - if ((ii == annz) || ((unman_rows(ii) / blockDim_) > cur_block)) { - // Flush the stored entries - ordinal_type ipos = row_map_host(cur_block); - for (auto jblock : set_blocks) tmp_entries_host(ipos++) = jblock; - if (ii == annz) break; - set_blocks.clear(); - cur_block = unman_rows(ii) / blockDim_; - } - ordinal_type tmp_jblock = unman_cols(ii) / blockDim_; - set_blocks.insert(tmp_jblock); + // write block data out to BSR format + ordinal_type row = 0; // current row 
we're in + size_t bi = 0; // how many blocks so far + for (auto& kv : blocks) { // iterating through blocks in row/col order + const Coord& block = kv.first; // block's position + auto& entries = kv.second; // non-zeros in the block + + if (OrdinalType(entries.size()) != blockDim_ * blockDim_) { + std::stringstream ss; + ss << "BsrMatrix: block " << block.first << "," << block.second + << " had only " << entries.size() << " non-zeros, expected " + << blockDim_ * blockDim_; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } - //--- Fill numerical values - for (size_type ii = 0; ii < annz; ++ii) { - const auto ilocal = unman_rows(ii) % blockDim_; - const auto jblock = unman_cols(ii) / blockDim_; - const auto jlocal = unman_cols(ii) % blockDim_; - for (auto jj = row_map_host(jblock); jj < row_map_host(jblock + 1); - ++jj) { - if (tmp_entries_host(jj) == jblock) { - const auto shift = - jj * blockDim_ * blockDim_ + ilocal * blockDim_ + jlocal; - values_host(shift) = unman_val(ii); - break; - } - } + + // update row-map if block is in a new row + for (; row < block.first; ++row) { + row_map_host(row + 1) = bi; // `row` ends at bi + } + + // record column of block + entries_host(bi) = block.second; // block's column + + // add contiguous entries of block sorted by row/col + std::sort(entries.begin(), entries.end(), entry_by_row_col); + for (size_type ei = 0; ei < size_type(entries.size()); ++ei) { + values_host(bi * blockDim_ * blockDim_ + ei) = std::get<1>(entries[ei]); } + + // next block + ++bi; + } + // complete row map if last blocks are empty + for (; row < numRows + 1; ++row) { + row_map_host(row) = bi; } - Kokkos::deep_copy(tmp_entries, tmp_entries_host); + // move graph data to the requested device + Kokkos::deep_copy(row_map_device, row_map_host); + Kokkos::deep_copy(entries_device, entries_host); Kokkos::deep_copy(values, values_host); - // Initialize graph using the temp entries and row_map Views - graph = staticcrsgraph_type(tmp_entries, 
tmp_row_map); + graph = staticcrsgraph_type(entries_device, row_map_device); } - /// \brief Constructor that accepts a row map, column indices, and - /// values. - /// - /// The matrix will store and use the row map, indices, and values - /// directly (by view, not by deep copy). - /// - /// \param label [in] The sparse matrix's label. - /// \param nrows [in] The number of rows. - /// \param ncols [in] The number of columns. - /// \param annz [in] The number of entries. - /// \param vals [in/out] The entries. - /// \param rows [in/out] The row map (containing the offsets to the - /// data in each row). - /// \param cols [in/out] The column indices. - BsrMatrix(const std::string& /*label*/, const OrdinalType nrows, - const OrdinalType ncols, const size_type /*annz*/, - const values_type& vals, const row_map_type& rows, - const index_type& cols, const OrdinalType blockDimIn) +/// \brief Constructor that accepts a row map, column indices, and +/// values. +/// +/// The matrix will store and use the row map, indices, and values +/// directly (by view, not by deep copy). +/// +/// \param label +/// \param nrows [in] The number of rows. +/// \param ncols [in] The number of columns. +/// \param annz [in] Filler for annz. +/// \param vals [in/out] The entries. +/// \param rows [in/out] The row map (containing the offsets to the +/// data in each row). +/// \param cols [in/out] The column indices. +/// \param blockDimIn [in] The block dimensions. +#if defined(DOXY) + BsrMatrix([[maybe_unused]] const std::string& label, +#else + // Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81429. 
+ BsrMatrix(const std::string& label [[maybe_unused]], +#endif + const OrdinalType nrows, const OrdinalType ncols, + [[maybe_unused]] const size_type annz, const values_type& vals, + const row_map_type& rows, const index_type& cols, + const OrdinalType blockDimIn) : graph(cols, rows), values(vals), numCols_(ncols), @@ -641,11 +688,10 @@ class BsrMatrix { /// The matrix will store and use the row map, indices, and values /// directly (by view, not by deep copy). /// - /// \param[in] label The sparse matrix's label. - /// \param[in] ncols The number of columns. - /// \param[in] vals The entries. - /// \param[in] graph_ The graph between the blocks. - /// \param[in] blockDimIn The block size. + /// \param ncols [in] The number of columns. + /// \param vals [in] The entries. + /// \param graph_ [in] The graph between the blocks. + /// \param blockDimIn [in] The block dimensions. BsrMatrix(const std::string& /*label*/, const OrdinalType& ncols, const values_type& vals, const staticcrsgraph_type& graph_, const OrdinalType& blockDimIn) @@ -775,17 +821,19 @@ class BsrMatrix { /// \brief Given an array of blocks, sum the values into corresponding /// block in BsrMatrix - /// \param[in] rowi is a block-row index - /// \param[in] ncol is number of blocks referenced in cols[] array - /// \param[in] cols[] are block colidxs within the block-row to be summed - /// into ncol entries - /// \param[in] vals[] array containing 'block' of values + /// \param rowi [in] is a block-row index + /// \param cols[] [in] are block colidxs within the block-row to be summed + /// into ncol entries + /// \param ncol [in] is number of blocks referenced in cols[] array + /// \param vals[] [in] array containing 'block' of values /// ncol*block_size*block_size entries /// assume vals block is provided in 'LayoutRight' or 'Row Major' /// format, that is e.g. 
2x2 block [ a b ; c d ] provided as flattened /// 1d array as [a b c d] Assume that each block is stored contiguously /// in vals: [a b; c d] [e f; g h] -> [a b c d e f g h] If so, then i /// in [0, ncols) for cols[] maps to i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] KOKKOS_INLINE_FUNCTION OrdinalType sumIntoValues(const OrdinalType rowi, const OrdinalType cols[], const OrdinalType ncol, const ScalarType vals[], @@ -797,17 +845,20 @@ class BsrMatrix { /// \brief Given an array of blocks, replace the values of corresponding /// blocks in BsrMatrix - /// \param[in] rowi is a block-row index - /// \param[in] ncol is number of blocks referenced in cols[] array - /// \param[in] cols[] are block colidxs within the block-row to be summed + /// \param rowi [in] is a block-row index + /// \param cols[] [in] are block colidxs within the block-row to be summed /// into ncol entries + /// \param ncol [in] is number of blocks referenced in cols[] array /// \param vals[] [in] array containing 'block' of values - // ncol*block_size*block_size entries - // assume vals block is provided in 'LayoutRight' or 'Row Major' - // format, that is e.g. 2x2 block [ a b ; c d ] provided as flattened - // 1d array as [a b c d] Assume that each block is stored contiguously - // in vals: [a b; c d] [e f; g h] -> [a b c d e f g h] If so, then i in - // [0, ncols) for cols[] maps to i*block_size*block_size in vals[] + /// ncol*block_size*block_size entries + /// assume vals block is provided in 'LayoutRight' or 'Row + /// Major' format, that is e.g. 
2x2 block [ a b ; c d ] provided + /// as flattened 1d array as [a b c d] Assume that each block is + /// stored contiguously in vals: [a b; c d] [e f; g h] -> [a b c + /// d e f g h] If so, then i in [0, ncols) for cols[] maps to + /// i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] KOKKOS_INLINE_FUNCTION OrdinalType replaceValues(const OrdinalType rowi, const OrdinalType cols[], const OrdinalType ncol, const ScalarType vals[], @@ -942,17 +993,21 @@ class BsrMatrix { /// \brief Given an array of blocks, operate on the values of corresponding /// blocks in BsrMatrix - /// \param[in] rowi is a block-row index - /// \param[in] ncol is number of blocks referenced in cols[] array - /// \param[in] cols[] are block colidxs within the block-row to be op-ed + /// \param op + /// \param rowi [in] is a block-row index + /// \param ncol [in] is number of blocks referenced in cols[] array + /// \param cols[] [in] are block colidxs within the block-row to be op-ed /// into ncol entries /// \param vals[] [in] array containing 'block' of values - // ncol*block_size*block_size entries - // assume vals block is provided in 'LayoutRight' or 'Row Major' - // format, that is e.g. 2x2 block [ a b ; c d ] provided as flattened - // 1d array as [a b c d] Assume that each block is stored contiguously - // in vals: [a b; c d] [e f; g h] -> [a b c d e f g h] If so, then i in - // [0, ncols) for cols[] maps to i*block_size*block_size in vals[] + /// ncol*block_size*block_size entries + /// assume vals block is provided in 'LayoutRight' or 'Row + /// Major' format, that is e.g. 
2x2 block [ a b ; c d ] provided + /// as flattened 1d array as [a b c d] Assume that each block is + /// stored contiguously in vals: [a b; c d] [e f; g h] -> [a b c + /// d e f g h] If so, then i in [0, ncols) for cols[] maps to + /// i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] KOKKOS_INLINE_FUNCTION OrdinalType operateValues(const BsrMatrix::valueOperation op, const OrdinalType rowi, const OrdinalType cols[], @@ -971,7 +1026,7 @@ class BsrMatrix { // + 1] (not global offset) colidx_ and values_ are already offset to the // beginning of blockrow rowi auto blk_offset = row_view.findRelBlockOffset(cols[i], is_sorted); - if (blk_offset != Kokkos::Details::ArithTraits::max()) { + if (blk_offset != Kokkos::ArithTraits::max()) { ordinal_type offset_into_vals = i * block_size * block_size; // stride == 1 assumed between elements diff --git a/sparse/src/KokkosSparse_CcsMatrix.hpp b/sparse/src/KokkosSparse_CcsMatrix.hpp index 5159f68d87..306b740a00 100644 --- a/sparse/src/KokkosSparse_CcsMatrix.hpp +++ b/sparse/src/KokkosSparse_CcsMatrix.hpp @@ -194,7 +194,6 @@ class CcsMatrix { /// The matrix will store and use the column map, indices, and values /// directly (by view, not by deep copy). /// - /// \param label [in] The sparse matrix's label. /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. /// \param annz [in] The number of entries. diff --git a/sparse/src/KokkosSparse_CooMatrix.hpp b/sparse/src/KokkosSparse_CooMatrix.hpp new file mode 100644 index 0000000000..30a41ba11c --- /dev/null +++ b/sparse/src/KokkosSparse_CooMatrix.hpp @@ -0,0 +1,163 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file KokkosSparse_CooMatrix.hpp +/// \brief Local sparse matrix interface +/// +/// This file provides KokkosSparse::CooMatrix. This implements a +/// local (no MPI) sparse matrix stored in coordinate ("Coo") format +/// which is also known as ijv or triplet format. + +#ifndef KOKKOS_SPARSE_COOMATRIX_HPP_ +#define KOKKOS_SPARSE_COOMATRIX_HPP_ + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_Error.hpp" +#include +#include + +namespace KokkosSparse { +/// \class CooMatrix +/// +/// \brief Coordinate format implementation of a sparse matrix. +/// +/// \tparam ScalarType The type of scalar entries in the sparse matrix. +/// \tparam OrdinalType The type of index entries in the sparse matrix. +/// \tparam Device The Kokkos Device type. +/// \tparam MemoryTraits Traits describing how Kokkos manages and +/// accesses data. The default parameter suffices for most users. +/// "Coo" stands for "coordinate format". +template ::size_type> +class CooMatrix { + public: + //! Type of each value in the matrix + using scalar_type = ScalarType; + //! Type of each value in the const matrix + using const_scalar_type = const std::remove_const_t; + //! Non constant scalar type + using non_const_scalar_type = std::remove_const_t; + //! Type of each index in the matrix + using ordinal_type = OrdinalType; + //! Type of each index in the const matrix + using const_ordinal_type = const std::remove_const_t; + //! Non constant ordinal type + using non_const_ordinal_type = std::remove_const_t; + //! Type of each row index in the matrix + using row_type = ordinal_type; + //! Type of each column index in the matrix + using column_type = ordinal_type; + //! Type of the Kokkos::Device + using device_type = Device; + //! 
Type of the Kokkos::Device::execution_space + using execution_space = typename device_type::execution_space; + //! Type of the Kokkos::Device::memory_space + using memory_space = typename device_type::memory_space; + //! Type of the Kokkos::MemoryTraits + using memory_traits = MemoryTraits; + //! Type of all integral class members + using size_type = SizeType; + + static_assert(std::is_integral_v, + "OrdinalType must be an integral."); + + //! The type of the row index view in the matrix + using row_view = + Kokkos::View; + //! The type of the column index view in the matrix + using column_view = Kokkos::View; + //! The type of the scalar values view in the matrix + using scalar_view = Kokkos::View; + //! The type of a constant CooMatrix + using const_type = CooMatrix; + + private: + size_type m_num_rows, m_num_cols; + row_view m_row; + column_view m_col; + scalar_view m_data; + + public: + /// \brief Default constructor; constructs an empty sparse matrix. + KOKKOS_INLINE_FUNCTION + CooMatrix() : m_num_rows(0), m_num_cols(0) {} + + // clang-format off + /// \brief Constructor that accepts a column indices view, row indices view, and + /// values view. + /// + /// The matrix will store and use the column indices, row indices, and values + /// directly (by view, not by deep copy). + /// + /// \param nrows [in] The number of rows. + /// \param ncols [in] The number of columns. + /// \param row_in [in] The row indexes. + /// \param col_in [in] The column indexes. + /// \param data_in [in] The values. 
+ // clang-format on + CooMatrix(size_type nrows, size_type ncols, row_view row_in, + column_view col_in, scalar_view data_in) + : m_num_rows(nrows), + m_num_cols(ncols), + m_row(row_in), + m_col(col_in), + m_data(data_in) { + if (m_data.extent(0) != m_row.extent(0) || + m_row.extent(0) != m_col.extent(0)) { + std::ostringstream os; + os << "data.extent(0): " << m_data.extent(0) << " != " + << "row.extent(0): " << m_row.extent(0) << " != " + << "col.extent(0): " << m_col.extent(0) << "."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + } + + //! The number of columns in the sparse matrix. + KOKKOS_INLINE_FUNCTION size_type numCols() const { return m_num_cols; } + + //! The number of rows in the sparse matrix. + KOKKOS_INLINE_FUNCTION size_type numRows() const { return m_num_rows; } + + //! The number of stored entries in the sparse matrix, including zeros. + KOKKOS_INLINE_FUNCTION size_type nnz() const { return m_data.extent(0); } + + //! The row indexes of the matrix + KOKKOS_INLINE_FUNCTION row_view row() const { return m_row; } + + //! The column indexes of the matrix + KOKKOS_INLINE_FUNCTION column_view col() const { return m_col; } + + //! The scalar values of the matrix + KOKKOS_INLINE_FUNCTION scalar_view data() const { return m_data; } +}; + +/// \class is_coo_matrix +/// \brief is_coo_matrix::value is true if T is a CooMatrix<...>, false +/// otherwise +template +struct is_coo_matrix : public std::false_type {}; +template +struct is_coo_matrix> : public std::true_type {}; +template +struct is_coo_matrix> : public std::true_type {}; + +} // namespace KokkosSparse +#endif diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index e13f332ff1..be3ac80343 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -159,29 +159,28 @@ struct SparseRowView { public: /// \brief Constructor /// - /// \param values [in] Array of the row's values. 
- /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in + /// \param values [in] Array of the row's values. + /// \param colidx__ [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in /// each of the above arrays. - /// \param count [in] Number of entries in the row. + /// \param count [in] Number of entries in the row. KOKKOS_INLINE_FUNCTION SparseRowView(value_type* const values, ordinal_type* const colidx__, const ordinal_type& stride, const ordinal_type& count) : values_(values), colidx_(colidx__), stride_(stride), length(count) {} /// \brief Constructor with offset into \c colidx array - /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in - /// each of the above arrays. - /// \param count [in] Number of entries in the row. - /// \param idx [in] Start offset into \c colidx array - /// /// \tparam OffsetType The type of \c idx (see above). Must be a /// built-in integer type. This may differ from ordinal_type. /// For example, the matrix may have dimensions that fit in int, /// but a number of entries that does not fit in int. + /// + /// \param values [in] Array of the row's values. + /// \param colidx__ [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array template KOKKOS_INLINE_FUNCTION SparseRowView( const typename MatrixType::values_type& values, @@ -259,11 +258,11 @@ struct SparseRowViewConst { public: /// \brief Constructor /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. 
- /// \param stride [in] (Constant) stride between matrix entries in - /// each of the above arrays. - /// \param count [in] Number of entries in the row. + /// \param values [in] Array of the row's values. + /// \param colidx__ [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. KOKKOS_INLINE_FUNCTION SparseRowViewConst(value_type* const values, ordinal_type* const colidx__, const ordinal_type& stride, const ordinal_type& count) @@ -271,17 +270,16 @@ struct SparseRowViewConst { /// \brief Constructor with offset into \c colidx array /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in - /// each of the above arrays. - /// \param count [in] Number of entries in the row. - /// \param idx [in] Start offset into \c colidx array - /// /// \tparam OffsetType The type of \c idx (see above). Must be a /// built-in integer type. This may differ from ordinal_type. /// For example, the matrix may have dimensions that fit in int, /// but a number of entries that does not fit in int. + /// \param values [in] Array of the row's values. + /// \param colidx__ [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array template KOKKOS_INLINE_FUNCTION SparseRowViewConst( const typename MatrixType::values_type& values, @@ -506,8 +504,9 @@ class CrsMatrix { /// The matrix will store and use the row map, indices /// (by view, not by deep copy) and allocate the values view. /// - /// \param label [in] The sparse matrix's label. - /// \param ncols [in] The number of columns. + /// \param label [in] The sparse matrix's label. 
+ /// \param graph_ [in] The graph for storing the rowmap and col ids. + /// \param ncols [in] The number of columns. template CrsMatrix(const std::string& label, @@ -523,14 +522,9 @@ class CrsMatrix { /// The matrix will store and use the row map, indices, and values /// directly (by view, not by deep copy). /// - /// \param label [in] The sparse matrix's label. - /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. - /// \param annz [in] The number of entries. /// \param vals [in/out] The entries. - /// \param rows [in/out] The row map (containing the offsets to the - /// data in each row). - /// \param cols [in/out] The column indices. + /// \param graph_ The graph for storing the rowmap and col ids. template CrsMatrix(const std::string&, const OrdinalType& ncols, @@ -550,7 +544,6 @@ class CrsMatrix { /// This constructor is mainly useful for benchmarking or for /// reading the sparse matrix's data from a file. /// - /// \param label [in] The sparse matrix's label. /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. /// \param annz [in] The number of entries. @@ -608,7 +601,6 @@ class CrsMatrix { /// The matrix will store and use the row map, indices, and values /// directly (by view, not by deep copy). /// - /// \param label [in] The sparse matrix's label. /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. /// \param annz [in] The number of entries. 
diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index 77934b4f3e..4704a8724c 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -177,8 +177,7 @@ void kk_diagonally_dominant_sparseMatrix_generate( entriesInRow.insert(pos); colInd[k] = pos; values[k] = 100.0 * rand() / RAND_MAX - 50.0; - total_values += - Kokkos::Details::ArithTraits::abs(values[k]); + total_values += Kokkos::ArithTraits::abs(values[k]); break; } } @@ -1180,33 +1179,16 @@ crsGraph_t read_kokkos_crst_graph(const char *filename_) { row_map_view_t rowmap_view("rowmap_view", nv + 1); cols_view_t columns_view("colsmap_view", nnzA); - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - - for (lno_t i = 0; i <= nv; ++i) { - hr(i) = xadj[i]; - } + typename row_map_view_t::HostMirror hr(xadj, nv + 1); + typename cols_view_t::HostMirror hc(adj, nnzA); + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); - for (size_type i = 0; i < nnzA; ++i) { - hc(i) = adj[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - } - - lno_t ncols = 0; - KokkosKernels::Impl::kk_view_reduce_max( - nnzA, columns_view, ncols); - ncols += 1; - - crsGraph_t static_graph(columns_view, rowmap_view, ncols); delete[] xadj; delete[] adj; delete[] values; + + crsGraph_t static_graph(columns_view, rowmap_view); return static_graph; } diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 3cef1b6315..1e2e408063 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -13,7 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -/// @file KokkosKernels_MatrixPrec.hpp + +/// @file KokkosSparse_MatrixPrec.hpp #ifndef KK_MATRIX_PREC_HPP #define KK_MATRIX_PREC_HPP @@ -27,6 +28,7 
@@ namespace KokkosSparse { namespace Experimental { +/// @file KokkosSparse_MatrixPrec.hpp /// \class MatrixPrec /// \brief This is a simple class to use if one /// already has a matrix representation of their diff --git a/sparse/src/KokkosSparse_OrdinalTraits.hpp b/sparse/src/KokkosSparse_OrdinalTraits.hpp index 8bf64a014f..6d76460939 100644 --- a/sparse/src/KokkosSparse_OrdinalTraits.hpp +++ b/sparse/src/KokkosSparse_OrdinalTraits.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOS_SPARSE_ORDINALTRAITS_HPP_ #define KOKKOS_SPARSE_ORDINALTRAITS_HPP_ -/// \file Kokkos_Sparse_OrdinalTraits.hpp +/// \file KokkosSparse_OrdinalTraits.hpp /// \brief Declaration and definition of KokkosSparse::OrdinalTraits, /// a traits class for "invalid" (flag) values of integer types that /// KokkosKernels uses as local ordinals or global ordinals. diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 8fdd9398b2..99ce1a2f1a 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -13,8 +13,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -/// @file KokkosKernels_Preconditioner.hpp -// + +/// @file KokkosSparse_Preconditioner.hpp + #ifndef KK_PREC_HPP #define KK_PREC_HPP diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index d04ddb5a30..31b835d358 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -33,43 +33,40 @@ template +void sort_bsr_matrix(const execution_space& exec, const lno_t blockdim, + const rowmap_t& rowmap, const entries_t& entries, + const values_t& values); + +// Sort a BSR matrix: within each row, sort entries ascending by column and +// permute the values accordingly. 
template void sort_bsr_matrix(const bsrMat_t& A); +// Sort a BSR matrix on the given execution space instance: within each row, +// sort entries ascending by column and permute the values accordingly. 
+template +void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, + const bsrMat_t& A); + // ---------------------------------- // CRS matrix/graph sorting utilities // ---------------------------------- // The sort_crs* functions sort the adjacent column list for each row into -// ascending order. - -template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values); - -template -void sort_crs_matrix(const crsMat_t& A); - -template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); - -template -void sort_crs_graph(const crsGraph_t& G); +// ascending order. Each version either takes an execution space instance as a +// parameter, or uses the default instance. // sort_and_merge_matrix produces a new matrix which is equivalent to A but is // sorted and has no duplicated entries: each (i, j) is unique. Values for -// duplicated entries are summed. -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A); - -template -crsGraph_t sort_and_merge_graph(const crsGraph_t& G); - -template -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, - entries_t& entries_out); +// duplicated entries are summed. Each version either takes an execution space +// instance as a parameter, or uses the default instance. If there are no +// duplicated entries in A, A is sorted and returned (instead of a newly +// allocated matrix). 
namespace Impl { @@ -216,8 +213,8 @@ struct MatrixMergedEntriesFunctor { using scalar_t = typename values_t::non_const_value_type; // Precondition: entries are sorted within each row - MatrixMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, - const values_t& values_, + MatrixMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, + const entries_t& entries_, const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, const values_t& mergedValues_) @@ -257,7 +254,7 @@ struct MatrixMergedEntriesFunctor { mergedEntries(insertPos) = accumCol; } - rowmap_t rowmap; + typename rowmap_t::const_type rowmap; entries_t entries; values_t values; rowmap_t mergedRowmap; @@ -271,7 +268,8 @@ struct GraphMergedEntriesFunctor { using lno_t = typename entries_t::non_const_value_type; // Precondition: entries are sorted within each row - GraphMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, + GraphMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, + const entries_t& entries_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_) : rowmap(rowmap_), @@ -301,7 +299,7 @@ struct GraphMergedEntriesFunctor { mergedEntries(insertPos) = accumCol; } - rowmap_t rowmap; + typename rowmap_t::const_type rowmap; entries_t entries; rowmap_t mergedRowmap; entries_t mergedEntries; @@ -360,8 +358,27 @@ struct sort_bsr_functor { // At the same time, permute the values. 
template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values) { +void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values) { + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_matrix: rowmap_t is not accessible from the given execution " + "space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_matrix: entries_t is not accessible from the given execution " + "space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_matrix: values_t is not accessible from the given execution " + "space"); + static_assert(!std::is_const_v, + "sort_crs_matrix: entries_t must not be const-valued"); + static_assert(!std::is_const_v, + "sort_crs_matrix: value_t must not be const-valued"); using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); @@ -371,7 +388,7 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, funct(useRadix, rowmap, entries, values); if (useRadix) { Kokkos::parallel_for("sort_crs_matrix", - Kokkos::RangePolicy(0, numRows), + Kokkos::RangePolicy(exec, 0, numRows), funct); } else { // Try to get teamsize to be largest power of 2 not greater than avg entries @@ -383,33 +400,47 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, while (idealTeamSize < avgDeg / 2) { idealTeamSize *= 2; } - team_pol temp(numRows, 1); + team_pol temp(exec, numRows, 1); lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); + Kokkos::parallel_for("sort_crs_matrix", team_pol(exec, numRows, teamSize), + funct); } } +template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { + 
sort_crs_matrix(execution_space(), rowmap, entries, values); +} + +template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { + sort_crs_matrix(typename entries_t::execution_space(), rowmap, entries, + values); +} + +template +void sort_crs_matrix(const typename crsMat_t::execution_space& exec, + const crsMat_t& A) { + sort_crs_matrix(exec, A.graph.row_map, A.graph.entries, A.values); +} + template void sort_crs_matrix(const crsMat_t& A) { - // Note: rowmap_t has const values, but that's OK as sorting doesn't modify it - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using exec_space = typename crsMat_t::execution_space; - // NOTE: the rowmap of a StaticCrsGraph is const-valued, but the - // entries and CrsMatrix values are non-const (so sorting them directly - // is allowed) - sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); + sort_crs_matrix(typename crsMat_t::execution_space(), A.graph.row_map, + A.graph.entries, A.values); } // Sort a BRS matrix: within each row, sort entries ascending by column and // permute the values accordingly. template -void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, - const entries_t& entries, const values_t& values) { +void sort_bsr_matrix(const execution_space& exec, const lno_t blockdim, + const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { // TODO: this is O(N^2) mock for debugging - do regular implementation based // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ? 
@@ -421,28 +452,54 @@ void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, Impl::sort_bsr_functor bsr_sorter( rowmap, entries, values, blocksize); Kokkos::parallel_for("sort_bsr_matrix", - Kokkos::RangePolicy(0, numRows), + Kokkos::RangePolicy(exec, 0, numRows), bsr_sorter); } +template +void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values) { + sort_bsr_matrix(execution_space(), blockdim, rowmap, entries, values); +} + // Sort a BSR matrix (like CRS but single values are replaced with contignous // blocks) template -void sort_bsr_matrix(const bsrMat_t& A) { +void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, + const bsrMat_t& A) { // NOTE: unlike rowmap, entries and values are non-const, so we can sort them // directly sort_bsr_matrix( - A.blockDim(), A.graph.row_map, A.graph.entries, A.values); + exec, A.blockDim(), A.graph.row_map, A.graph.entries, A.values); +} + +template +void sort_bsr_matrix(const bsrMat_t& A) { + sort_bsr_matrix(typename bsrMat_t::execution_space(), A); } // Sort a CRS graph: within each row, sort entries ascending by column. template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { +void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, + const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_graph: rowmap_t is not accessible from the given execution " + "space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_graph: entries_t is not accessible from the given execution " + "space"); + static_assert(!std::is_const_v, + "sort_crs_graph: entries_t must not be const-valued"); bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; if (numRows == 0) return; @@ -450,7 +507,7 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { useRadix, rowmap, entries); if (useRadix) { Kokkos::parallel_for("sort_crs_graph", - Kokkos::RangePolicy(0, numRows), + Kokkos::RangePolicy(exec, 0, numRows), funct); } else { // Try to get teamsize to be largest power of 2 less than or equal to @@ -463,102 +520,268 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { while (idealTeamSize < avgDeg / 2) { idealTeamSize *= 2; } - team_pol temp(numRows, 1); + team_pol temp(exec, numRows, 1); lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); + Kokkos::parallel_for("sort_crs_graph", team_pol(exec, numRows, teamSize), + funct); + } +} + +template +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { + sort_crs_graph(execution_space(), rowmap, entries); +} + +// This overload covers 2 cases, while allowing all template args to be deduced: +// - sort_crs_graph(exec, G) +// - sort_crs_graph(rowmap, entries) +template +void sort_crs_graph(const Arg1& a1, const Arg2& a2) { + if constexpr (Kokkos::is_execution_space_v) { + // a1 is an exec instance, a2 is a graph + sort_crs_graph(a1, a2.row_map, a2.entries); + } else if constexpr (Kokkos::is_view_v) { + // a1 is rowmap, a2 is entries + sort_crs_graph(typename Arg2::execution_space(), a1, a2); + } else { + static_assert(Arg1::doesnthavethisthing, + "sort_crs_graph(arg1, arg2): expect either (exec, G) or " + "(rowmap, entries)"); } } template void sort_crs_graph(const crsGraph_t& G) { - static_assert( - !std::is_const::value, - "sort_crs_graph requires StaticCrsGraph entries to be non-const."); - sort_crs_graph(G.row_map, G.entries); + sort_crs_graph(typename crsGraph_t::execution_space(), G); } -// Sort the rows of matrix, and merge duplicate 
entries. -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - using c_rowmap_t = typename crsMat_t::row_map_type; - using rowmap_t = typename crsMat_t::row_map_type::non_const_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using size_type = typename rowmap_t::non_const_value_type; - using exec_space = typename crsMat_t::execution_space; - using range_t = Kokkos::RangePolicy; - sort_crs_matrix(A); +template +void sort_and_merge_matrix(const exec_space& exec, + const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, + const values_t& values_in, rowmap_t& rowmap_out, + entries_t& entries_out, values_t& values_out) { + using nc_rowmap_t = typename rowmap_t::non_const_type; + using size_type = typename nc_rowmap_t::value_type; + using ordinal_t = typename entries_t::value_type; + using range_t = Kokkos::RangePolicy; + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_matrix: rowmap_t is not accessible from the given " + "execution space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_matrix: entries_t is not accessible from the given " + "execution space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_matrix: values_t is not accessible from the given " + "execution space"); + static_assert(!std::is_const_v, + "sort_and_merge_matrix: entries_t must not be const-valued"); + static_assert(!std::is_const_v, + "sort_and_merge_matrix: value_t must not be const-valued"); + + ordinal_t numRows = + rowmap_in.extent(0) ? 
ordinal_t(rowmap_in.extent(0) - 1) : ordinal_t(0); + size_type nnz = entries_in.extent(0); + + if (numRows == 0) { + rowmap_out = typename rowmap_t::non_const_type("SortedMerged rowmap", + rowmap_in.extent(0)); + entries_out = entries_t(); + values_out = values_t(); + return; + } + + sort_crs_matrix(exec, rowmap_in, entries_in, values_in); + // Count entries per row into a new rowmap, in terms of merges that can be // done - rowmap_t mergedRowmap( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), - A.numRows() + 1); + nc_rowmap_t nc_rowmap_out( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged rowmap"), + numRows + 1); size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, A.numRows()), - Impl::MergedRowmapFunctor( - mergedRowmap, A.graph.row_map, A.graph.entries), + Kokkos::parallel_reduce(range_t(exec, 0, numRows), + Impl::MergedRowmapFunctor( + nc_rowmap_out, rowmap_in, entries_in), numCompressedEntries); + if (nnz == numCompressedEntries) { + // No merges to do, so just return A. Save the time of allocating and + // filling a copy. + if constexpr (std::is_const_v) { + rowmap_out = rowmap_in; + } else { + // rowmap_t is non-const, so we can't directly assign rowmap_in to + // rowmap_out. Forced to deep copy it to maintain const-correctness. 
+ Kokkos::deep_copy(exec, nc_rowmap_out, rowmap_in); + rowmap_out = nc_rowmap_out; + } + entries_out = entries_in; + values_out = values_in; + return; + } // Prefix sum to get rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - A.numRows() + 1, mergedRowmap); - entries_t mergedEntries("SortedMerged entries", numCompressedEntries); - values_t mergedValues("SortedMerged values", numCompressedEntries); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + exec, numRows + 1, nc_rowmap_out); + rowmap_out = nc_rowmap_out; + entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged entries"), + numCompressedEntries); + values_out = values_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged values"), + numCompressedEntries); // Compute merged entries and values Kokkos::parallel_for( - range_t(0, A.numRows()), - Impl::MatrixMergedEntriesFunctor( - A.graph.row_map, A.graph.entries, A.values, mergedRowmap, - mergedEntries, mergedValues)); - // Finally, construct the new compressed matrix + range_t(exec, 0, numRows), + Impl::MatrixMergedEntriesFunctor( + rowmap_in, entries_in, values_in, rowmap_out, entries_out, + values_out)); +} + +// Sort the rows of matrix, and merge duplicate entries. 
+template +crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, + const crsMat_t& A) { + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + + rowmap_t rowmap_out; + entries_t entries_out; + values_t values_out; + + sort_and_merge_matrix(exec, A.graph.row_map, A.graph.entries, A.values, + rowmap_out, entries_out, values_out); + return crsMat_t("SortedMerged", A.numRows(), A.numCols(), - numCompressedEntries, mergedValues, mergedRowmap, - mergedEntries); + values_out.extent(0), values_out, rowmap_out, entries_out); +} + +template +crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + return sort_and_merge_matrix(typename crsMat_t::execution_space(), A); +} + +template +void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, + const values_t& values_in, rowmap_t& rowmap_out, + entries_t& entries_out, values_t& values_out) { + sort_and_merge_matrix(exec_space(), rowmap_in, entries_in, values_in, + rowmap_out, entries_out, values_out); +} + +template +void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, + const values_t& values_in, rowmap_t& rowmap_out, + entries_t& entries_out, values_t& values_out) { + sort_and_merge_matrix(typename entries_t::execution_space(), rowmap_in, + entries_in, values_in, rowmap_out, entries_out, + values_out); } template -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, +void sort_and_merge_graph(const exec_space& exec, + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, rowmap_t& rowmap_out, entries_t& entries_out) { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using range_t = Kokkos::RangePolicy; - using const_rowmap_t = typename rowmap_t::const_type; - lno_t numRows = 
rowmap_in.extent(0); - if (numRows <= 1) { - // Matrix has zero rows - rowmap_out = rowmap_t(); + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::value_type; + using range_t = Kokkos::RangePolicy; + using nc_rowmap_t = typename rowmap_t::non_const_type; + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_graph: rowmap_t is not accessible from the given " + "execution space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_graph: entries_t is not accessible from the given " + "execution space"); + static_assert(!std::is_const_v, + "sort_and_merge_graph: entries_t must not be const-valued"); + + lno_t numRows = rowmap_in.extent(0) ? rowmap_in.extent(0) - 1 : 0; + if (numRows == 0) { + rowmap_out = typename rowmap_t::non_const_type("SortedMerged rowmap", + rowmap_in.extent(0)); entries_out = entries_t(); return; } - numRows--; // Sort in place - sort_crs_graph(rowmap_in, entries_in); + sort_crs_graph(exec, rowmap_in, entries_in); // Count entries per row into a new rowmap, in terms of merges that can be // done - rowmap_out = rowmap_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), + nc_rowmap_t nc_rowmap_out( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged rowmap"), numRows + 1); size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, numRows), + Kokkos::parallel_reduce(range_t(exec, 0, numRows), Impl::MergedRowmapFunctor( - rowmap_out, rowmap_in, entries_in), + nc_rowmap_out, rowmap_in, entries_in), + numCompressedEntries); + if (entries_in.extent(0) == size_t(numCompressedEntries)) { + // No merges to perform, so the output rowmap is unchanged and we can just + // return the now-sorted entries_in. + if constexpr (std::is_const_v) { + rowmap_out = rowmap_in; + } else { + // rowmap_t is non-const, so we can't directly assign rowmap_in to + // rowmap_out. 
Forced to deep copy it to maintain const-correctness. + Kokkos::deep_copy(exec, nc_rowmap_out, rowmap_in); + rowmap_out = nc_rowmap_out; + } + entries_out = entries_in; + return; + } + // Prefix sum to get rowmap. + // In the case where the output rowmap is the same as the input, we could just + // assign "rowmap_out = rowmap_in" except that would break const-correctness. + // Can skip filling the entries, however. + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + exec, numRows + 1, nc_rowmap_out); + rowmap_out = nc_rowmap_out; + entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged entries"), numCompressedEntries); - // Prefix sum to get rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - numRows + 1, rowmap_out); - entries_out = entries_t("SortedMerged entries", numCompressedEntries); // Compute merged entries and values - Kokkos::parallel_for( - range_t(0, numRows), - Impl::GraphMergedEntriesFunctor( - rowmap_in, entries_in, rowmap_out, entries_out)); + Kokkos::parallel_for(range_t(exec, 0, numRows), + Impl::GraphMergedEntriesFunctor( + rowmap_in, entries_in, rowmap_out, entries_out)); +} + +template +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out) { + return sort_and_merge_graph(exec_space(), rowmap_in, entries_in, rowmap_out, + entries_out); +} + +template +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out) { + return sort_and_merge_graph(typename entries_t::execution_space(), rowmap_in, + entries_in, rowmap_out, entries_out); } template -crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { +crsGraph_t sort_and_merge_graph( + const typename crsGraph_t::execution_space& exec, const crsGraph_t& G) { using rowmap_t = typename crsGraph_t::row_map_type::non_const_type; using entries_t = typename 
crsGraph_t::entries_type; static_assert( @@ -566,12 +789,15 @@ crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { "sort_and_merge_graph requires StaticCrsGraph entries to be non-const."); rowmap_t mergedRowmap; entries_t mergedEntries; - sort_and_merge_graph(G.row_map, G.entries, mergedRowmap, - mergedEntries); + sort_and_merge_graph(exec, G.row_map, G.entries, mergedRowmap, mergedEntries); return crsGraph_t(mergedEntries, mergedRowmap); } +template +crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { + return sort_and_merge_graph(typename crsGraph_t::execution_space(), G); +} + } // namespace KokkosSparse namespace KokkosKernels { @@ -651,44 +877,6 @@ template entries_out); } -// For backward compatibility: keep the public interface accessible in -// KokkosKernels::Impl:: -namespace Impl { -template -[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, - const entries_t& entries) { - KokkosKernels::sort_crs_graph(rowmap, - entries); -} - -template -[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, - const entries_t& entries, - const values_t& values) { - KokkosKernels::sort_crs_matrix(rowmap, entries, values); -} - -template -[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { - KokkosKernels::sort_crs_matrix(A); -} - -template -[[deprecated]] void sort_and_merge_graph( - const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, - rowmap_t& rowmap_out, entries_t& entries_out) { - KokkosKernels::sort_and_merge_graph( - rowmap_in, entries_in, rowmap_out, entries_out); -} - -template -[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - return KokkosKernels::sort_and_merge_matrix(A); -} - -} // namespace Impl } // namespace KokkosKernels #endif // _KOKKOSSPARSE_SORTCRS_HPP diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 2b72fa01cc..4039b6f5a7 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -725,26 +725,27 @@ struct 
Reverse_Map_Functor { } }; -/** - * \brief Utility function to obtain a reverse map given a map. - * Input is a map with the number of elements within the map. - * forward_map[c] = i, where c is a forward element and forward_map has a size - * of num_forward_elements. i is the value that c is mapped in the forward map, - * and the range of that is num_reverse_elements. Output is the reverse_map_xadj - * and reverse_map_adj such that, all c, forward_map[c] = i, will appear in - * reverse_map_adj[ reverse_map_xadj[i]: reverse_map_xadj[i+1]) \param: - * num_forward_elements: the number of elements in the forward map, the size of - * the forward map. \param: num_reverse_elements: the number of elements that - * forward map is mapped to. It is the value of max i. \param: forward_map: - * input forward_map, where forward_map[c] = i. \param: reverse_map_xadj: - * reverse map xadj, that is it will hold the beginning and end indices on - * reverse_map_adj such that all values mapped to i will be [ - * reverse_map_xadj[i]: reverse_map_xadj[i+1]) its size will be - * num_reverse_elements + 1. NO NEED TO INITIALIZE. \param: reverse_map_adj: - * reverse map adj, holds the values of reverse maps. Its size is - * num_forward_elements. - * - */ +/// \brief Utility function to obtain a reverse map given a map. +/// Input is a map with the number of elements within the map. +/// forward_map[c] = i, where c is a forward element and forward_map has a size +/// of num_forward_elements. i is the value that c is mapped in the forward map, +/// and the range of that is num_reverse_elements. Output is the +/// reverse_map_xadj and reverse_map_adj such that, all c, forward_map[c] = i, +/// will appear in reverse_map_adj[ reverse_map_xadj[i]: reverse_map_xadj[i+1]) + +/// \param num_forward_elements the number of elements in the forward map, +/// the size of the forward map. +/// \param num_reverse_elements the number of elements that +/// forward map is mapped to. It is the value of max i. 
+/// \param forward_map input forward_map, where forward_map[c] = i. +/// \param reverse_map_xadj +/// reverse map xadj, that is it will hold the beginning and +/// end indices on reverse_map_adj such that all values mapped +/// to i will be [reverse_map_xadj[i]: reverse_map_xadj[i+1]) +/// its size will be num_reverse_elements + 1. +/// NO NEED TO INITIALIZE. +/// \param reverse_map_adj reverse map adj, holds the values of reverse +/// maps. Its size is num_forward_elements. template void kk_create_reverse_map( @@ -906,25 +907,13 @@ struct ColorChecker { } }; -/** - * \brief given a graph and a coloring function returns true or false if - distance-1 coloring is valid or not. - * \param num_rows: num rows in input graph - * \param num_cols: num cols in input graph - * \param xadj: row pointers of the input graph - * \param adj: column indices of the input graph - * \param t_xadj: output, the row indices of the output graph. MUST BE - INITIALIZED WITH ZEROES. - - * \param vector_size: suggested vector size, optional. if -1, kernel will - decide. - * \param suggested_team_size: suggested team size, optional. if -1, kernel will - decide. - * \param team_work_chunk_size: suggested work size of a team, optional. if -1, - kernel will decide. - * \param use_dynamic_scheduling: whether to use dynamic scheduling. Default is - true. - */ +/// \brief given a graph and a coloring function returns true or false if +/// distance-1 coloring is valid or not. +/// +/// \param num_rows num rows in input graph +/// \param xadj row pointers of the input graph +/// \param adj column indices of the input graph +/// \param v_colors The colors at each vertex in the graph. 
template inline size_t kk_is_d1_coloring_valid( diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp index dd479610ca..e263dfd0fa 100644 --- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -17,7 +17,10 @@ #ifndef _KOKKOSKERNELS_SPARSEUTILS_ROCSPARSE_HPP #define _KOKKOSKERNELS_SPARSEUTILS_ROCSPARSE_HPP +#include + #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +#include #include "rocsparse/rocsparse.h" namespace KokkosSparse { @@ -149,21 +152,32 @@ inline rocsparse_datatype rocsparse_compute_type>() { return rocsparse_datatype_f64_c; } -template -struct kokkos_to_rocsparse_type { - using type = Scalar; +template +struct kokkos_to_rocsparse_type; + +// for floats, rocsparse uses c++ builtin types +template +struct kokkos_to_rocsparse_type>> { + using type = T; }; +// translate complex float template <> struct kokkos_to_rocsparse_type> { using type = rocsparse_float_complex; }; +// translate complex double template <> struct kokkos_to_rocsparse_type> { using type = rocsparse_double_complex; }; +// e.g. 5.4 -> 50400 +#define KOKKOSSPARSE_IMPL_ROCM_VERSION \ + ROCM_VERSION_MAJOR * 10000 + ROCM_VERSION_MINOR * 100 + ROCM_VERSION_PATCH + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_ccs2crs.hpp b/sparse/src/KokkosSparse_ccs2crs.hpp index 50fec77411..9b4bae2134 100644 --- a/sparse/src/KokkosSparse_ccs2crs.hpp +++ b/sparse/src/KokkosSparse_ccs2crs.hpp @@ -115,7 +115,7 @@ auto ccs2crs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, /// /// \tparam ScalarType The ccsMatrix::scalar_type /// \tparam OrdinalType The ccsMatrix::ordinal_type -/// \tparam Device The ccsMatrix::device_type +/// \tparam DeviceType The ccsMatrix::device_type /// \tparam MemoryTraits The ccsMatrix::memory_traits /// \tparam SizeType The ccsMatrix::size_type /// \param ccsMatrix The KokkosSparse::CcsMatrix. 
diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp new file mode 100644 index 0000000000..45e54ce474 --- /dev/null +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -0,0 +1,103 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSSPARSE_COO2CRS_HPP +#define _KOKKOSSPARSE_COO2CRS_HPP +// The unorderedmap changes necessary for this to work +// have not made it into Kokkos 4.0.00 or 4.0.01; we will +// need to see if it happens in 4.1.00 to have a final +// version check here. +#if KOKKOS_VERSION >= 40099 || defined(DOXY) + +#include "KokkosSparse_CooMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_Utils.hpp" + +#include "KokkosSparse_coo2crs_impl.hpp" + +namespace KokkosSparse { +// clang-format off +/// +/// \brief Blocking function that converts a CooMatrix into a CrsMatrix. Values are summed. +/// \tparam DimType the dimension type +/// \tparam RowViewType The row array view type +/// \tparam ColViewType The column array view type +/// \tparam DataViewType The data array view type +/// \param m the number of rows +/// \param n the number of columns +/// \param row the array of row ids +/// \param col the array of col ids +/// \param data the array of data +/// \return A KokkosSparse::CrsMatrix. 
+// clang-format on +template +auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, + DataViewType data) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "RowViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "CalViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "DataViewType must be a Kokkos::View."); + static_assert(static_cast(RowViewType::rank) == 1, + "RowViewType must have rank 1."); + static_assert(static_cast(ColViewType::rank) == 1, + "ColViewType must have rank 1."); + static_assert(static_cast(DataViewType::rank) == 1, + "DataViewType must have rank 1."); +#endif + + static_assert(std::is_integral::value, + "RowViewType::value_type must be an integral."); + static_assert(std::is_integral::value, + "ColViewType::value_type must be an integral."); + + if (row.extent(0) != col.extent(0) || row.extent(0) != data.extent(0)) + Kokkos::abort("row.extent(0) = col.extent(0) = data.extent(0) required."); + + if constexpr (std::is_signed_v) { + if (m < 0 || n < 0) Kokkos::abort("m >= 0 and n >= 0 required."); + } + + using Coo2crsType = + Impl::Coo2Crs; + Coo2crsType Coo2Crs(m, n, row, col, data); + return Coo2Crs.get_crsMat(); +} + +// clang-format off +/// +/// \brief Blocking function that converts a CooMatrix into a CrsMatrix. Values are summed. +/// \tparam ScalarType The `KokkosSparse::CooMatrix::scalar_type` +/// \tparam OrdinalType The KokkosSparse::CooMatrix::ordinal_type +/// \tparam DeviceType The KokkosSparse::CooMatrix::device_type +/// \tparam MemoryTraits The KokkosSparse::CooMatrix::memory_traits +/// \tparam SizeType The KokkosSparse::CooMatrix::size_type +/// \param cooMatrix The sparse matrix stored in coordinate ("Coo") format. +/// \return A KokkosSparse::CrsMatrix. 
+// clang-format on +template +auto coo2crs(KokkosSparse::CooMatrix &cooMatrix) { + return coo2crs(cooMatrix.numRows(), cooMatrix.numCols(), cooMatrix.row, + cooMatrix.col, cooMatrix.data); +} +} // namespace KokkosSparse +#endif // KOKKOS_VERSION >= 40099 || defined(DOXY) +#endif // _KOKKOSSPARSE_COO2CRS_HPP diff --git a/sparse/src/KokkosSparse_crs2ccs.hpp b/sparse/src/KokkosSparse_crs2ccs.hpp index 9def73b5db..c9265842cb 100644 --- a/sparse/src/KokkosSparse_crs2ccs.hpp +++ b/sparse/src/KokkosSparse_crs2ccs.hpp @@ -113,7 +113,7 @@ auto crs2ccs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, /// /// \tparam ScalarType The crsMatrix::scalar_type /// \tparam OrdinalType The crsMatrix::ordinal_type -/// \tparam Device The crsMatrix::device_type +/// \tparam DeviceType The crsMatrix::device_type /// \tparam MemoryTraits The crsMatrix::memory_traits /// \tparam SizeType The crsMatrix::size_type /// \param crsMatrix The KokkosSparse::CrsMatrix. diff --git a/sparse/src/KokkosSparse_crs2coo.hpp b/sparse/src/KokkosSparse_crs2coo.hpp new file mode 100644 index 0000000000..8292b26250 --- /dev/null +++ b/sparse/src/KokkosSparse_crs2coo.hpp @@ -0,0 +1,164 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_CooMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#ifndef _KOKKOSSPARSE_CRS2COO_HPP +#define _KOKKOSSPARSE_CRS2COO_HPP +namespace KokkosSparse { +namespace Impl { +template +class Crs2Coo { + private: + using scalar_type = typename ValViewType::value_type; + using const_scalar_type = const std::remove_const_t; + using non_const_scalar_type = std::remove_const_t; + + using ordinal_type = OrdinalType; + using const_ordinal_type = const std::remove_const_t; + using non_const_ordinal_type = std::remove_const_t; + + using size_type = SizeType; + using const_size_type = const std::remove_const_t; + using non_const_size_type = std::remove_const_t; + + using device_type = DeviceType; + + using row_view = typename Kokkos::View; + using col_view = row_view; + using non_const_coo_data_view = typename ValViewType::non_const_type; + using coo_type = CooMatrix; + + non_const_ordinal_type m_nrows; + non_const_ordinal_type m_ncols; + non_const_size_type m_nnz; + + non_const_coo_data_view m_data; + col_view m_col; + row_view m_row; + + ValViewType m_vals; + RowMapViewType m_row_map; + ColIdViewType m_col_ids; + + using copy_tp1_pt = Kokkos::TeamPolicy; + using copy_tp1_member_type = typename copy_tp1_pt::member_type; + + public: + Crs2Coo(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, + RowMapViewType row_map, ColIdViewType col_ids) + : m_nrows(nrows), + m_ncols(ncols), + m_nnz(nnz), + m_vals(vals), + m_row_map(row_map), + m_col_ids(col_ids) { + m_data = non_const_coo_data_view( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_data"), nnz); + m_col = + col_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_col"), nnz); + m_row = + row_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_row"), nnz); + + copy_tp1_pt policy(m_nrows, 1, 1); + { + auto vec_len_max = policy.vector_length_max(); + copy_tp1_pt 
query_policy(m_nrows, 1, vec_len_max); + policy = copy_tp1_pt( + m_nrows, + query_policy.team_size_recommended(*this, Kokkos::ParallelForTag()), + vec_len_max); + } + + Kokkos::parallel_for("Crs2Coo", policy, *this); + DeviceType().fence(); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const copy_tp1_member_type &member) const { + auto i = member.league_rank(); + auto row_start = m_row_map(i); + auto row_len = m_row_map(i + 1) - row_start; + auto row_end = row_start + row_len; + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, row_start, row_end), + [&](const size_type &id) { + m_data(id) = m_vals(id); + m_col(id) = m_col_ids(id); + m_row(id) = i; + }); + } + + coo_type get_cooMat() { + return coo_type(m_nrows, m_ncols, m_row, m_col, m_data); + } +}; +} // namespace Impl +// clang-format off +/// +/// \brief Blocking function that converts a CrsMatrix to a CooMatrix. +/// Crs values are copied into the CooMatrix in the order they appear +/// within the CrsMatrix, starting from row 0 to row nrows - 1. +/// \tparam OrdinalType The view value type associated with the RowIdViewType +/// \tparam SizeType The type of nnz +/// \tparam ValViewType The values view type +/// \tparam RowMapViewType The row map view type +/// \tparam ColIdViewType The column ids view type +/// \param nrows The number of rows in the crs matrix +/// \param ncols The number of columns in the crs matrix +/// \param nnz The number of non-zeros in the crs matrix +/// \param vals The values view of the crs matrix +/// \param row_map The row map view of the crs matrix +/// \param col_ids The col ids view of the crs matrix +/// \return A KokkosSparse::CooMatrix. 
+/// +// clang-format on +template +auto crs2coo(OrdinalType nrows, OrdinalType ncols, SizeType nnz, + ValViewType vals, RowMapViewType row_map, ColIdViewType col_ids) { + using Crs2cooType = Impl::Crs2Coo; + Crs2cooType crs2Coo(nrows, ncols, nnz, vals, row_map, col_ids); + return crs2Coo.get_cooMat(); +} + +/// +/// @brief Blocking function that converts a CrsMatrix to a CooMatrix. +/// Crs values are copied into the CooMatrix in the order they appear +/// within the CrsMatrix, starting from row 0 to row nrows - 1. +/// +/// \tparam ScalarType The crsMatrix::scalar_type +/// \tparam OrdinalType The crsMatrix::ordinal_type +/// \tparam DeviceType The crsMatrix::device_type +/// \tparam MemoryTraits The crsMatrix::memory_traits +/// \tparam SizeType The crsMatrix::size_type +/// \param crsMatrix The KokkosSparse::CrsMatrix. +/// \return A KokkosSparse::CooMatrix. +template +auto crs2coo(KokkosSparse::CrsMatrix &crsMatrix) { + return crs2coo(crsMatrix.numRows(), crsMatrix.numCols(), crsMatrix.nnz(), + crsMatrix.values, crsMatrix.graph.row_map, + crsMatrix.graph.entries); +} +} // namespace KokkosSparse +#endif // _KOKKOSSPARSE_CRS2COO_HPP \ No newline at end of file diff --git a/sparse/src/KokkosSparse_findRelOffset.hpp b/sparse/src/KokkosSparse_findRelOffset.hpp index 3f806b411e..f4c702a610 100644 --- a/sparse/src/KokkosSparse_findRelOffset.hpp +++ b/sparse/src/KokkosSparse_findRelOffset.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOS_SPARSE_FINDRELOFFSET_HPP #define KOKKOS_SPARSE_FINDRELOFFSET_HPP -/// \file Kokkos_Sparse_findRelOffset.hpp +/// \file KokkosSparse_findRelOffset.hpp /// \brief Find the relative offset of a column index in a sparse /// graph's or sparse matrix's row. 
diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 89654243cc..9f1b9d8cb1 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -25,6 +25,23 @@ namespace KokkosSparse { namespace Experimental { +/// +/// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity +/// pattern only) +/// +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @pre handle->create_gs_handle(...) has been called previously +/// template void gauss_seidel_symbolic(KernelHandle *handle, @@ -85,6 +102,23 @@ void gauss_seidel_symbolic(KernelHandle *handle, is_graph_symmetric); } +/// +/// @brief Block Gauss-Seidel preconditioner setup (first phase, based on +/// sparsity pattern only) +/// +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param block_size The number of degrees of freedom per block +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @pre handle->create_gs_handle(...) 
has been called previously template void block_gauss_seidel_symbolic( @@ -104,6 +138,25 @@ void block_gauss_seidel_symbolic( is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values) +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// template num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. 
template num_rows x +/// num_rows submatrix of A is structurally symmetric +/// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// template ::type, \ typename std::remove_const::type>::value +/// @brief +/// @tparam KernelHandle +/// @tparam AMatrix +/// @tparam BType +/// @tparam XType +/// @param handle +/// @param A +/// @param B +/// @param X +/// @param precond template void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 672da5b4de..1c5216bfe5 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -66,9 +66,10 @@ template void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using value_type = typename crs_matrix_type::value_type; + using values_mag_type 
= + typename KokkosSparse::Impl::MDF_types::values_mag_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using value_mag_type = typename values_mag_type::value_type; using execution_space = typename crs_matrix_type::execution_space; using range_policy_type = Kokkos::RangePolicy; @@ -82,14 +83,14 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { crs_matrix_type Atmp = crs_matrix_type("A fill", A); crs_matrix_type At = KokkosSparse::Impl::transpose_matrix(A); KokkosSparse::sort_crs_matrix(At); - values_type discarded_fill("discarded fill", A.numRows()); + values_mag_type discarded_fill("discarded fill", A.numRows()); col_ind_type deficiency("deficiency", A.numRows()); col_ind_type update_list_length("update list length", 1); typename col_ind_type::HostMirror update_list_length_host = Kokkos::create_mirror_view(update_list_length); col_ind_type update_list("update list", A.numRows()); col_ind_type factored("factored rows", A.numRows()); - Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); + Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index c7def802b9..8ded6209ec 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -48,6 +48,28 @@ namespace Experimental { std::is_same::type, \ typename std::remove_const::type>::value +/// @brief Performs the symbolic phase of par_ilut. +/// This is a non-blocking function. +/// +/// The sparsity pattern of A will be analyzed and L_rowmap and U_rowmap will be +/// populated with the L (lower triangular) and U (upper triangular) non-zero +/// counts respectively. Having a separate symbolic phase allows for reuse when +/// dealing with multiple matrices with the same sparsity pattern. 
This routine +/// will set some values on handle for symbolic info (row count, nnz counts). +/// +/// @tparam KernelHandle Template for the KernelHandle type +/// @tparam ARowMapType Template for A_rowmap type +/// @tparam AEntriesType Template for A_entries type +/// @tparam LRowMapType Template for L_rowmap type +/// @tparam URowMapType Template for U_rowmap type +/// @param handle The kernel handle. It is expected that create_par_ilut_handle +/// has been called on it +/// @param A_rowmap The row map (row nnz offsets) for the A CSR (Input) +/// @param A_entries The entries (column ids) for the A CSR (Input) +/// @param L_rowmap The row map for the L CSR, should already be sized correctly +/// (numRows+1) (Output) +/// @param U_rowmap The row map for the U CSR, should already be sized correctly +/// (numRows+1) (Output) template void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, @@ -176,6 +198,34 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, } // par_ilut_symbolic +/// @brief Performs the numeric phase (for specific CSRs, not reusable) of the +/// par_ilut +/// algorithm (described in the header). This is a non-blocking +/// function. It is expected that par_ilut_symbolic has already been +/// called for the +/// provided KernelHandle. +/// +/// @tparam KernelHandle Template for the handle type +/// @tparam ARowMapType Template for the A_rowmap type +/// @tparam AEntriesType Template for the A_entries type +/// @tparam AValuesType Template for the A_values type +/// @tparam LRowMapType Template for the L_rowmap type +/// @tparam LEntriesType Template for the L_entries type +/// @tparam LValuesType Template for the L_values type +/// @tparam URowMapType Template for the U_rowmap type +/// @tparam UEntriesType Template for the U_entries type +/// @tparam UValuesType Template for the U_values type +/// @param handle The kernel handle. 
It is expected that create_par_ilut_handle +/// has been called on it +/// @param A_rowmap The row map (row nnz offsets) for the A CSR (Input) +/// @param A_entries The entries (column ids) for the A CSR (Input) +/// @param A_values The values (non-zero matrix values) for the A CSR (Input) +/// @param L_rowmap The row map (row nnz offsets) for the L CSR (Input/Output) +/// @param L_entries The entries (column ids) for the L CSR (Output) +/// @param L_values The values (non-zero matrix values) for the L CSR (Output) +/// @param U_rowmap The row map (row nnz offsets) for the U CSR (Input/Output) +/// @param U_entries The entries (column ids) for the U CSR (Output) +/// @param U_values The values (non-zero matrix values) for the U CSR (Output) template result_nnz_size = result_nnz_size_; } diff --git a/sparse/src/KokkosSparse_spgemm.hpp b/sparse/src/KokkosSparse_spgemm.hpp index 11cb58ed4a..882dfd5ec2 100644 --- a/sparse/src/KokkosSparse_spgemm.hpp +++ b/sparse/src/KokkosSparse_spgemm.hpp @@ -23,6 +23,20 @@ namespace KokkosSparse { +/// +/// @brief +/// +/// @tparam KernelHandle +/// @tparam AMatrix +/// @tparam BMatrix +/// @tparam CMatrix +/// @param kh +/// @param A +/// @param Amode +/// @param B +/// @param Bmode +/// @param C +/// template void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { @@ -54,7 +68,20 @@ void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, entriesC); } -// Symbolic phase for block SpGEMM (BSR matrices) +/// +/// @brief Symbolic phase for block SpGEMM (BSR matrices) +/// +/// @tparam KernelHandle +/// @tparam AMatrixType +/// @tparam BMatrixType +/// @tparam CMatrixType +/// @param kh +/// @param A +/// @param transposeA +/// @param B +/// @param transposeB +/// @param C +/// template void block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A, @@ -95,6 +122,20 @@ void block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A, row_mapC, 
entriesC, blockDim); } +/// +/// @brief +/// +/// @tparam KernelHandle +/// @tparam AMatrix +/// @tparam BMatrix +/// @tparam CMatrix +/// @param kh +/// @param A +/// @param Amode +/// @param B +/// @param Bmode +/// @param C +/// template void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { @@ -108,6 +149,20 @@ void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, B.values, Bmode, C.graph.row_map, C.graph.entries, C.values); } +/// +/// @brief +/// +/// @tparam KernelHandle +/// @tparam AMatrix +/// @tparam BMatrix +/// @tparam CMatrix +/// @param kh +/// @param A +/// @param Amode +/// @param B +/// @param Bmode +/// @param C +/// template void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { @@ -123,6 +178,18 @@ void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, B.values, Bmode, C.graph.row_map, C.graph.entries, C.values, blockDim); } +/// +/// @brief +/// +/// @tparam CMatrix +/// @tparam AMatrix +/// @tparam BMatrix +/// @param A +/// @param Amode +/// @param B +/// @param Bmode +/// @return CMatrix +/// template CMatrix spgemm(const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode) { diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index fdcceb2d93..1106d300c8 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -385,10 +385,8 @@ class SPGEMMHandle { num_used_colors = num_used_colors_; } - /** - * \brief sets the result nnz size. - * \param result_nnz_size: size of the output matrix. - */ + /// \brief sets the result nnz size. + /// \param result_nnz_size_ size of the output matrix. 
void set_c_nnz(size_type result_nnz_size_) { this->result_nnz_size = result_nnz_size_; } @@ -778,6 +776,70 @@ class SPGEMMHandle { } bool get_compression_step() { return is_compression_single_step; } + + private: + // An SpGEMM handle can be reused for multiple products C = A*B, but only if + // the sparsity patterns of A and B do not change. Enforce this (in debug + // builds only) by recording hashes of the graphs, and then checking they + // match in later calls. + bool computedInputHashes = false; + uint32_t a_graph_hash = 0U; + uint32_t b_graph_hash = 0U; + + public: + template + bool checkMatrixIdentitiesSymbolic(const a_rowptrs_t &a_rowptrsIn, + const a_entries_t &a_entriesIn, + const b_rowptrs_t &b_rowptrsIn, + const b_entries_t &b_entriesIn) { +#ifndef NDEBUG + // If this is the first symbolic call, assign the handle's CRS pointers to + // check against later + if (!computedInputHashes) { + a_graph_hash = KokkosKernels::Impl::hashView(a_rowptrsIn) ^ + KokkosKernels::Impl::hashView(a_entriesIn); + b_graph_hash = KokkosKernels::Impl::hashView(b_rowptrsIn) ^ + KokkosKernels::Impl::hashView(b_entriesIn); + computedInputHashes = true; + } else { + if (a_graph_hash != (KokkosKernels::Impl::hashView(a_rowptrsIn) ^ + KokkosKernels::Impl::hashView(a_entriesIn))) + return false; + if (b_graph_hash != (KokkosKernels::Impl::hashView(b_rowptrsIn) ^ + KokkosKernels::Impl::hashView(b_entriesIn))) + return false; + } +#else + (void)a_rowptrsIn; + (void)a_entriesIn; + (void)b_rowptrsIn; + (void)b_entriesIn; +#endif + return true; + } + + template + bool checkMatrixIdentitiesNumeric(const a_rowptrs_t &a_rowptrsIn, + const a_entries_t &a_entriesIn, + const b_rowptrs_t &b_rowptrsIn, + const b_entries_t &b_entriesIn) { +#ifndef NDEBUG + if (a_graph_hash != (KokkosKernels::Impl::hashView(a_rowptrsIn) ^ + KokkosKernels::Impl::hashView(a_entriesIn))) + return false; + if (b_graph_hash != (KokkosKernels::Impl::hashView(b_rowptrsIn) ^ + KokkosKernels::Impl::hashView(b_entriesIn))) 
+ return false; +#else + (void)a_rowptrsIn; + (void)a_entriesIn; + (void)b_rowptrsIn; + (void)b_entriesIn; +#endif + return true; + } }; inline SPGEMMAlgorithm StringToSPGEMMAlgorithm(std::string &name) { diff --git a/sparse/src/KokkosSparse_spgemm_numeric.hpp b/sparse/src/KokkosSparse_spgemm_numeric.hpp index 043a01b9f1..e0930c04ee 100644 --- a/sparse/src/KokkosSparse_spgemm_numeric.hpp +++ b/sparse/src/KokkosSparse_spgemm_numeric.hpp @@ -236,7 +236,24 @@ void spgemm_numeric(KernelHandle *handle, return; } - auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type(); + auto spgemmHandle = tmp_handle.get_spgemm_handle(); + + if (!spgemmHandle) { + throw std::invalid_argument( + "KokkosSparse::spgemm_numeric: the given KernelHandle does not have " + "an SpGEMM handle associated with it."); + } + + if (!spgemmHandle->checkMatrixIdentitiesNumeric(const_a_r, const_a_l, + const_b_r, const_b_l)) { + throw std::invalid_argument( + "KokkosSparse::spgemm_numeric: once used, an spgemm handle cannot be " + "reused for a product with a different sparsity pattern.\n" + "The rowptrs and entries of A and B must be identical to those " + "passed to the first spgemm_symbolic and spgemm_numeric calls."); + } + + auto algo = spgemmHandle->get_algorithm_type(); if (algo == SPGEMM_DEBUG || algo == SPGEMM_SERIAL) { // Never call a TPL if serial/debug is requested (this is needed for diff --git a/sparse/src/KokkosSparse_spgemm_symbolic.hpp b/sparse/src/KokkosSparse_spgemm_symbolic.hpp index 486d999e41..2bde5f6e20 100644 --- a/sparse/src/KokkosSparse_spgemm_symbolic.hpp +++ b/sparse/src/KokkosSparse_spgemm_symbolic.hpp @@ -162,7 +162,24 @@ void spgemm_symbolic(KernelHandle *handle, } #endif - auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type(); + auto spgemmHandle = tmp_handle.get_spgemm_handle(); + + if (!spgemmHandle) { + throw std::invalid_argument( + "KokkosSparse::spgemm_symbolic: the given KernelHandle does not have " + "an SpGEMM handle associated with it."); + } 
+ + if (!spgemmHandle->checkMatrixIdentitiesSymbolic(const_a_r, const_a_l, + const_b_r, const_b_l)) { + throw std::invalid_argument( + "KokkosSparse::spgemm_symbolic: once used, an spgemm handle cannot be " + "reused for a product with a different sparsity pattern.\n" + "The rowptrs and entries of A and B must be identical to those " + "passed to the first spgemm_symbolic call."); + } + + auto algo = spgemmHandle->get_algorithm_type(); if (algo == SPGEMM_DEBUG || algo == SPGEMM_SERIAL) { // Never call a TPL if serial/debug is requested (this is needed for diff --git a/sparse/src/KokkosSparse_spiluk.hpp b/sparse/src/KokkosSparse_spiluk.hpp index ac2afc066f..1bf78abe5e 100644 --- a/sparse/src/KokkosSparse_spiluk.hpp +++ b/sparse/src/KokkosSparse_spiluk.hpp @@ -46,7 +46,8 @@ void spiluk_symbolic(KernelHandle* handle, typename KernelHandle::const_nnz_lno_t fill_lev, ARowMapType& A_rowmap, AEntriesType& A_entries, LRowMapType& L_rowmap, LEntriesType& L_entries, - URowMapType& U_rowmap, UEntriesType& U_entries) { + URowMapType& U_rowmap, UEntriesType& U_entries, + int nstreams = 1) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; @@ -240,7 +241,7 @@ void spiluk_symbolic(KernelHandle* handle, LEntries_Internal, URowMap_Internal, UEntries_Internal>::spiluk_symbolic(&tmp_handle, fill_lev, A_rowmap_i, A_entries_i, L_rowmap_i, L_entries_i, - U_rowmap_i, U_entries_i); + U_rowmap_i, U_entries_i, nstreams); } // spiluk_symbolic @@ -522,21 +523,476 @@ void spiluk_numeric(KernelHandle* handle, UValues_Internal U_values_i = U_values; KokkosSparse::Impl::SPILUK_NUMERIC< - const_handle_type, ARowMap_Internal, AEntries_Internal, AValues_Internal, - LRowMap_Internal, LEntries_Internal, LValues_Internal, URowMap_Internal, - UEntries_Internal, UValues_Internal>::spiluk_numeric(&tmp_handle, - fill_lev, A_rowmap_i, - A_entries_i, - A_values_i, - L_rowmap_i, - L_entries_i, - L_values_i, - U_rowmap_i, - U_entries_i, - U_values_i); + 
typename AValuesType::execution_space, const_handle_type, + ARowMap_Internal, AEntries_Internal, AValues_Internal, LRowMap_Internal, + LEntries_Internal, LValues_Internal, URowMap_Internal, UEntries_Internal, + UValues_Internal>::spiluk_numeric(&tmp_handle, fill_lev, A_rowmap_i, + A_entries_i, A_values_i, L_rowmap_i, + L_entries_i, L_values_i, U_rowmap_i, + U_entries_i, U_values_i); } // spiluk_numeric +template +void spiluk_numeric_streams(const std::vector& execspace_v, + const std::vector& handle_v, + typename KernelHandle::const_nnz_lno_t fill_lev, + const std::vector& A_rowmap_v, + const std::vector& A_entries_v, + const std::vector& A_values_v, + const std::vector& L_rowmap_v, + const std::vector& L_entries_v, + std::vector& L_values_v, + const std::vector& U_rowmap_v, + const std::vector& U_entries_v, + std::vector& U_values_v) { + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using scalar_type = typename KernelHandle::nnz_scalar_t; + + static_assert(Kokkos::is_execution_space::value, + "ExecutionSpace is not valid"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename ARowMapType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "ARowMapType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename AEntriesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "AEntriesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename AValuesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "AValuesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename LRowMapType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "LRowMapType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename 
LEntriesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "LEntriesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename LValuesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "LValuesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename URowMapType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "URowMapType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename UEntriesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "UEntriesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename UValuesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "UValuesType"); + + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename ARowMapType::non_const_value_type, + size_type), + "spiluk_numeric_streams: A size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename AEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: A entry type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename AValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: A scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename LRowMapType::non_const_value_type, + size_type), + "spiluk_numeric_streams: L size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename LEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: L entry type must match KernelHandle entry " + 
"type (aka nnz_lno_t, and const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename LValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: L scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename URowMapType::non_const_value_type, + size_type), + "spiluk_numeric_streams: U size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename UEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: U entry type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename UValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: U scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: A_rowmap is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: A_entries is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: A_values is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: L_rowmap is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: L_entries is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: L_values is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: U_rowmap is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: U_entries is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: U_values is not a Kokkos::View."); + + static_assert((int)LRowMapType::rank == (int)ARowMapType::rank, + "spiluk_numeric_streams: The ranks of L_rowmap and 
A_rowmap do " + "not match."); + static_assert((int)LEntriesType::rank == (int)AEntriesType::rank, + "spiluk_numeric_streams: The ranks of L_entries and A_entries " + "do not match."); + static_assert((int)LValuesType::rank == (int)AValuesType::rank, + "spiluk_numeric_streams: The ranks of L_values and A_values do " + "not match."); + + static_assert((int)LRowMapType::rank == (int)URowMapType::rank, + "spiluk_numeric_streams: The ranks of L_rowmap and U_rowmap do " + "not match."); + static_assert((int)LEntriesType::rank == (int)UEntriesType::rank, + "spiluk_numeric_streams: The ranks of L_entries and U_entries " + "do not match."); + static_assert((int)LValuesType::rank == (int)UValuesType::rank, + "spiluk_numeric_streams: The ranks of L_values and U_values do " + "not match."); + + static_assert(LRowMapType::rank == 1, + "spiluk_numeric_streams: A_rowmap, L_rowmap and U_rowmap must " + "all have rank 1."); + static_assert( + LEntriesType::rank == 1, + "spiluk_numeric_streams: A_entries, L_entries and U_entries must all " + "have rank 1."); + static_assert(LValuesType::rank == 1, + "spiluk_numeric_streams: A_values, L_values and U_values must " + "all have rank 1."); + + static_assert( + std::is_same::value, + "spiluk_numeric_streams: The output L_entries must be nonconst."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: The output L_values must be nonconst."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: The output U_entries must be nonconst."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: The output U_values must be nonconst."); + + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LRowMapType and ARowMapType have " + "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LEntriesType and AEntriesType have " + "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LValuesType and 
AValuesType have " + "different device_types."); + + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LRowMapType and URowMapType have " + "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LEntriesType and UEntriesType have " + "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LValuesType and UValuesType have " + "different device_types."); + + static_assert( + std::is_same< + ExecutionSpace, + typename KernelHandle::SPILUKHandleType::execution_space>::value, + "spiluk_numeric_streams: KernelHandle's execution space is different " + "from " + "ExecutionSpace."); + + static_assert( + std::is_same< + typename LRowMapType::device_type::execution_space, + typename KernelHandle::SPILUKHandleType::execution_space>::value, + "spiluk_numeric_streams: KernelHandle and Views have different execution " + "spaces."); + static_assert( + std::is_same< + typename LEntriesType::device_type::execution_space, + typename KernelHandle::SPILUKHandleType::execution_space>::value, + "spiluk_numeric_streams: KernelHandle and Views have different execution " + "spaces."); + static_assert( + std::is_same< + typename LValuesType::device_type::execution_space, + typename KernelHandle::SPILUKHandleType::execution_space>::value, + "spiluk_numeric_streams: KernelHandle and Views have different execution " + "spaces."); + + static_assert(std::is_same::value, + "spiluk_numeric_streams: rowmap and entries have different " + "device types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: rowmap and values have different device types."); + + // Check validity of fill level + if (fill_lev < 0) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: fill_lev: " + << fill_lev << ". 
Valid value is >= 0."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Check sizes of vectors + if (execspace_v.size() != handle_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. handle_v.size() " << handle_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != A_rowmap_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. A_rowmap_v.size() " << A_rowmap_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != A_entries_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. A_entries_v.size() " + << A_entries_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != A_values_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. A_values_v.size() " << A_values_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != L_rowmap_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. L_rowmap_v.size() " << L_rowmap_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != L_entries_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. 
L_entries_v.size() " + << L_entries_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != L_values_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. L_values_v.size() " << L_values_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != U_rowmap_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. U_rowmap_v.size() " << U_rowmap_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != U_entries_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. U_entries_v.size() " + << U_entries_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != U_values_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. 
U_values_v.size() " << U_values_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Check if symbolic has been called + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + if (handle_v[i]->get_spiluk_handle()->is_symbolic_complete() == false) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: " + "spiluk_symbolic must be " + "called before spiluk_numeric_streams -- stream " + << i; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + } + + using c_size_t = typename KernelHandle::const_size_type; + using c_lno_t = typename KernelHandle::const_nnz_lno_t; + using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; + using c_exec_t = typename KernelHandle::HandleExecSpace; + using c_temp_t = typename KernelHandle::HandleTempMemorySpace; + using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; + + using const_handle_type = + typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, c_exec_t, c_temp_t, c_persist_t>; + + using ARowMap_Internal = Kokkos::View< + typename ARowMapType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename ARowMapType::device_type, + Kokkos::MemoryTraits >; + + using AEntries_Internal = Kokkos::View< + typename AEntriesType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout< + AEntriesType>::array_layout, + typename AEntriesType::device_type, + Kokkos::MemoryTraits >; + + using AValues_Internal = Kokkos::View< + typename AValuesType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename AValuesType::device_type, + Kokkos::MemoryTraits >; + + using LRowMap_Internal = Kokkos::View< + typename LRowMapType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename LRowMapType::device_type, + Kokkos::MemoryTraits >; + + using LEntries_Internal = Kokkos::View< + typename 
LEntriesType::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout< + LEntriesType>::array_layout, + typename LEntriesType::device_type, + Kokkos::MemoryTraits >; + + using LValues_Internal = Kokkos::View< + typename LValuesType::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename LValuesType::device_type, + Kokkos::MemoryTraits >; + + using URowMap_Internal = Kokkos::View< + typename URowMapType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename URowMapType::device_type, + Kokkos::MemoryTraits >; + + using UEntries_Internal = Kokkos::View< + typename UEntriesType::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout< + UEntriesType>::array_layout, + typename UEntriesType::device_type, + Kokkos::MemoryTraits >; + + using UValues_Internal = Kokkos::View< + typename UValuesType::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename UValuesType::device_type, + Kokkos::MemoryTraits >; + + std::vector handle_i_v(execspace_v.size()); + std::vector A_rowmap_i_v(execspace_v.size()); + std::vector A_entries_i_v(execspace_v.size()); + std::vector A_values_i_v(execspace_v.size()); + std::vector L_rowmap_i_v(execspace_v.size()); + std::vector L_entries_i_v(execspace_v.size()); + std::vector L_values_i_v(execspace_v.size()); + std::vector U_rowmap_i_v(execspace_v.size()); + std::vector U_entries_i_v(execspace_v.size()); + std::vector U_values_i_v(execspace_v.size()); + + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + handle_i_v[i] = const_handle_type(*(handle_v[i])); + A_rowmap_i_v[i] = A_rowmap_v[i]; + A_entries_i_v[i] = A_entries_v[i]; + A_values_i_v[i] = A_values_v[i]; + L_rowmap_i_v[i] = L_rowmap_v[i]; + L_entries_i_v[i] = L_entries_v[i]; + L_values_i_v[i] = L_values_v[i]; + U_rowmap_i_v[i] = U_rowmap_v[i]; + U_entries_i_v[i] = U_entries_v[i]; + U_values_i_v[i] = U_values_v[i]; + } + + 
KokkosSparse::Impl::SPILUK_NUMERIC< + ExecutionSpace, const_handle_type, ARowMap_Internal, AEntries_Internal, + AValues_Internal, LRowMap_Internal, LEntries_Internal, LValues_Internal, + URowMap_Internal, UEntries_Internal, + UValues_Internal>::spiluk_numeric_streams(execspace_v, handle_i_v, + A_rowmap_i_v, A_entries_i_v, + A_values_i_v, L_rowmap_i_v, + L_entries_i_v, L_values_i_v, + U_rowmap_i_v, U_entries_i_v, + U_values_i_v); + +} // spiluk_numeric_streams + } // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 0c097b4a43..60fb5331cf 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -53,7 +53,7 @@ struct RANK_TWO {}; /// \param x [in] A vector. /// \param beta [in] Scalar multiplier for the multivector y. /// \param y [in/out] vector. -/// \param RANK_ONE tag dispatch +/// \param tag RANK_ONE dispatch /// #ifdef DOXY // documentation version template ) { + useFallback = useFallback || (mode[0] != NoTranspose[0]); + } +#endif + if (useFallback) { // Explicitly call the non-TPL SPMV_BSRMATRIX implementation std::string label = @@ -502,8 +511,8 @@ struct SPMV2D1D +#include +#include +#include // requires C++11, but so does Kokkos +#include + +namespace KokkosSparse { +namespace Experimental { + +/// \brief Sparse matrix-vector multiply: y = beta*y + alpha*A*x. 
+/// +template +int KOKKOS_INLINE_FUNCTION team_spmv( + const TeamType &team, const ScalarType &alpha, const ValuesViewType &values, + const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const ScalarType &beta, const yViewType &y, const int dobeta) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "ValuesViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "IntView must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "xViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "yViewType must be a Kokkos::View."); + static_assert(static_cast(ValuesViewType::rank) == 1, + "ValuesViewType must have rank 1."); + static_assert(static_cast(IntView::rank) == 1, + "IntView must have rank 1."); + static_assert(static_cast(xViewType::rank) == 1, + "xViewType must have rank 1."); + static_assert(static_cast(yViewType::rank) == 1, + "yViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if (values.extent(0) != colIndices.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " + "values: %d, colIndices: %d", + (int)values.extent(0), (int)colIndices.extent(0)); + return 1; + } + + if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " + "x: %d, y: %d, row_ptr: %d", + (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); + return 1; + } +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + if (dobeta == 1) + return KokkosSparse::TeamSpmv::template invoke< + ScalarType, ValuesViewType, IntView, xViewType, yViewType, 1>( + team, alpha, values, row_ptr, colIndices, x, beta, y); + else + return KokkosSparse::TeamSpmv::template invoke< + ScalarType, ValuesViewType, IntView, xViewType, yViewType, 0>( + team, alpha, values, row_ptr, colIndices, x, beta, y); +} + +/// \brief Sparse matrix-vector multiply: y = beta*y + alpha*A*x. 
+/// +template +int KOKKOS_INLINE_FUNCTION team_vector_spmv( + const TeamType &team, const ScalarType &alpha, const ValuesViewType &values, + const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const ScalarType &beta, const yViewType &y, const int dobeta) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "ValuesViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "IntView must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "xViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "yViewType must be a Kokkos::View."); + static_assert(static_cast(ValuesViewType::rank) == 1, + "ValuesViewType must have rank 1."); + static_assert(static_cast(IntView::rank) == 1, + "IntView must have rank 1."); + static_assert(static_cast(xViewType::rank) == 1, + "xViewType must have rank 1."); + static_assert(static_cast(yViewType::rank) == 1, + "yViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if (values.extent(0) != colIndices.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " + "values: %d, colIndices: %d", + (int)values.extent(0), (int)colIndices.extent(0)); + return 1; + } + + if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " + "x: %d, y: %d, row_ptr: %d", + (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); + return 1; + } +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + if (dobeta == 1) + return KokkosSparse::TeamVectorSpmv::template invoke< + ScalarType, ValuesViewType, IntView, xViewType, yViewType, 1>( + team, alpha, values, row_ptr, colIndices, x, beta, y); + else + return KokkosSparse::TeamVectorSpmv::template invoke< + ScalarType, ValuesViewType, IntView, xViewType, yViewType, 0>( + team, alpha, values, row_ptr, colIndices, x, beta, y); +} + +} // namespace Experimental +} // namespace KokkosSparse + +#endif // KOKKOS_BLAS2_MV_HPP_ diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 04cb0f5285..859918c58d 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -312,10 +312,10 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, } else { KokkosSparse::Impl::SPTRSV_SOLVE< - const_handle_type, RowMap_Internal, Entries_Internal, Values_Internal, - BType_Internal, XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, - entries_i, values_i, b_i, - x_i); + typename scalar_nnz_view_t_::execution_space, const_handle_type, + RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, entries_i, + values_i, b_i, x_i); } } // sptrsv_solve @@ -369,6 +369,224 @@ void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, } #endif +template +void sptrsv_solve_streams(const std::vector 
&execspace_v, + const std::vector &handle_v, + const std::vector &rowmap_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &b_v, + std::vector &x_v) { + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using scalar_type = typename KernelHandle::nnz_scalar_t; + + static_assert(Kokkos::is_execution_space::value, + "ExecutionSpace is not valid"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename lno_row_view_t_::memory_space>::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in " + "lno_row_view_t_"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename lno_nnz_view_t_::memory_space>::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in " + "lno_nnz_view_t_"); + static_assert(Kokkos::SpaceAccessibility< + ExecutionSpace, + typename scalar_nnz_view_t_::memory_space>::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in " + "scalar_nnz_view_t_"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in BType"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in XType"); + + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( + typename lno_row_view_t_::non_const_value_type, size_type), + "sptrsv_solve_streams: A size_type must match KernelHandle " + "size_type (const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPTRSV_SAME_TYPE( + typename lno_nnz_view_t_::non_const_value_type, ordinal_type), + "sptrsv_solve_streams: A entry type must match KernelHandle entry type " + "(aka nnz_lno_t, and const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( + typename scalar_nnz_view_t_::value_type, scalar_type), + "sptrsv_solve_streams: A scalar type must match KernelHandle " + "entry type (aka nnz_lno_t, and const doesn't 
matter)"); + + static_assert(Kokkos::is_view::value, + "sptrsv_solve_streams: b is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "sptrsv_solve_streams: x is not a Kokkos::View."); + static_assert((int)BType::rank == (int)XType::rank, + "sptrsv_solve_streams: The ranks of b and x do not match."); + static_assert(BType::rank == 1, + "sptrsv_solve_streams: b and x must both either have rank 1."); + static_assert(std::is_same::value, + "sptrsv_solve_streams: The output x must be nonconst."); + static_assert(std::is_same::value, + "sptrsv_solve_streams: Views BType and XType have different " + "device_types."); + static_assert( + std::is_same< + ExecutionSpace, + typename KernelHandle::SPTRSVHandleType::execution_space>::value, + "sptrsv_solve_streams: KernelHandle's execution space is different from " + "ExecutionSpace."); + static_assert( + std::is_same< + typename BType::device_type::execution_space, + typename KernelHandle::SPTRSVHandleType::execution_space>::value, + "sptrsv_solve_streams: KernelHandle and Views have different execution " + "spaces."); + static_assert( + std::is_same::value, + "sptrsv_solve_streams: rowmap and entries have different device types."); + static_assert( + std::is_same::value, + "sptrsv_solve_streams: rowmap and values have different device types."); + + // Check sizes of vectors + if (execspace_v.size() != handle_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. handle_v.size() " << handle_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != rowmap_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. 
rowmap_v.size() " << rowmap_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != entries_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. entries_v.size() " << entries_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != values_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. values_v.size() " << values_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != b_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. b_v.size() " << b_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != x_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. 
x_v.size() " << x_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + using c_size_t = typename KernelHandle::const_size_type; + using c_lno_t = typename KernelHandle::const_nnz_lno_t; + using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; + using c_exec_t = typename KernelHandle::HandleExecSpace; + using c_temp_t = typename KernelHandle::HandleTempMemorySpace; + using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; + + using const_handle_type = + typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, c_exec_t, c_temp_t, c_persist_t>; + + using RowMap_Internal = Kokkos::View< + typename lno_row_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_row_view_t_>::array_layout, + typename lno_row_view_t_::device_type, + Kokkos::MemoryTraits >; + + using Entries_Internal = Kokkos::View< + typename lno_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_nnz_view_t_>::array_layout, + typename lno_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; + + using Values_Internal = Kokkos::View< + typename scalar_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + scalar_nnz_view_t_>::array_layout, + typename scalar_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; + + using BType_Internal = Kokkos::View< + typename BType::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename BType::device_type, + Kokkos::MemoryTraits >; + + using XType_Internal = Kokkos::View< + typename XType::non_const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XType::device_type, Kokkos::MemoryTraits >; + + std::vector handle_i_v(execspace_v.size()); + std::vector rowmap_i_v(execspace_v.size()); + std::vector entries_i_v(execspace_v.size()); + std::vector values_i_v(execspace_v.size()); + std::vector b_i_v(execspace_v.size()); + 
std::vector x_i_v(execspace_v.size()); + + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + handle_i_v[i] = const_handle_type(*(handle_v[i])); + rowmap_i_v[i] = rowmap_v[i]; + entries_i_v[i] = entries_v[i]; + values_i_v[i] = values_v[i]; + b_i_v[i] = b_v[i]; + x_i_v[i] = x_v[i]; + } + + if (handle_v[0]->get_sptrsv_handle()->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + // NOTE: assume all streams use the same SPTRSV_CUSPARSE algo. + KokkosSparse::Impl::sptrsvcuSPARSE_solve_streams< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, XType_Internal>( + execspace_v, handle_i_v, rowmap_i_v, entries_i_v, values_i_v, b_i_v, + x_i_v, false); + + } else { + KokkosSparse::Impl::SPTRSV_SOLVE< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve_streams(execspace_v, handle_i_v, + rowmap_i_v, entries_i_v, + values_i_v, b_i_v, x_i_v); + } + +} // sptrsv_solve_streams + } // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_trsv.hpp b/sparse/src/KokkosSparse_trsv.hpp index ce970ac370..1363542f1b 100644 --- a/sparse/src/KokkosSparse_trsv.hpp +++ b/sparse/src/KokkosSparse_trsv.hpp @@ -14,7 +14,7 @@ // //@HEADER -/// \file Kokkos_Sparse_trsv.hpp +/// \file KokkosSparse_trsv.hpp /// \brief Local sparse triangular solve /// /// This file provides KokkosSparse::trsv. 
This function performs a diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp index 81d3273e17..ea3edb518f 100644 --- a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp @@ -19,6 +19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "mkl.h" +#endif + namespace KokkosSparse { namespace Impl { @@ -59,18 +63,21 @@ SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_NOREUSE_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_noreuse_tpl_spec_avail< \ - KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>> { \ - enum : bool { value = true }; \ +#define SPGEMM_NOREUSE_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_noreuse_tpl_spec_avail< \ + KokkosSparse::CrsMatrix, void, \ + MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>> { \ + enum : bool { value = true }; \ }; #define SPGEMM_NOREUSE_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp index f3d32a01fb..1067f3924f 100644 --- a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp @@ -226,33 +226,39 @@ Matrix spgemm_noreuse_mkl(const MatrixConst &A, const MatrixConst &B) { return Matrix("C", m, k, c_nnz, valuesC, row_mapC, entriesC); } -#define SPGEMM_NOREUSE_DECL_MKL(SCALAR, 
EXEC, TPL_AVAIL) \ - template <> \ - struct SPGEMM_NOREUSE< \ - KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ - true, TPL_AVAIL> { \ - using Matrix = KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>; \ - using ConstMatrix = KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>; \ - static KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int> \ - spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ - std::string label = "KokkosSparse::spgemm_noreuse[TPL_MKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - Matrix C = spgemm_noreuse_mkl(A, B); \ - Kokkos::Profiling::popRegion(); \ - return C; \ - } \ +#define SPGEMM_NOREUSE_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ + template <> \ + struct SPGEMM_NOREUSE< \ + KokkosSparse::CrsMatrix, void, \ + MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + true, TPL_AVAIL> { \ + using Matrix = \ + KokkosSparse::CrsMatrix, void, \ + MKL_INT>; \ + using ConstMatrix = KokkosSparse::CrsMatrix< \ + const SCALAR, const MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>; \ + static KokkosSparse::CrsMatrix, \ + void, MKL_INT> \ + spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ + std::string label = "KokkosSparse::spgemm_noreuse[TPL_MKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + Matrix C = spgemm_noreuse_mkl(A, B); \ + Kokkos::Profiling::popRegion(); \ + return C; \ + } \ }; #define SPGEMM_NOREUSE_DECL_MKL_SE(SCALAR, EXEC) \ diff --git 
a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index bfba70d913..e144b53162 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -19,6 +19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "mkl.h" +#endif + namespace KokkosSparse { namespace Impl { @@ -129,40 +133,40 @@ SPGEMM_NUMERIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_numeric_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_numeric_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + 
Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_NUMERIC_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 5f555f926e..6c87c60caf 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -129,7 +129,7 @@ template void spgemm_numeric_cusparse( - KernelHandle *handle, lno_t m, lno_t n, lno_t k, + KernelHandle *handle, lno_t /*m*/, lno_t /*n*/, lno_t /*k*/, const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA, const ConstValuesType &valuesA, const ConstRowMapType &row_mapB, const ConstEntriesType &entriesB, const ConstValuesType &valuesB, @@ -553,30 +553,30 @@ void spgemm_numeric_mkl( #define SPGEMM_NUMERIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ template <> \ struct SPGEMM_NUMERIC, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ true, TPL_AVAIL> { \ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ Kokkos::HostSpace>; \ using c_int_view_t = \ - Kokkos::View, \ Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ Kokkos::MemoryTraits>; \ using c_scalar_view_t = \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 80454be92b..1fcfa7132a 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -19,6 
+19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists @@ -101,28 +105,28 @@ SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_symbolic_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_symbolic_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_SYMBOLIC_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp index 5db0fa18a9..e662934d00 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp @@ -594,8 +594,10 @@ void spgemm_symbolic_mkl( handle->set_c_nnz(0); return; } - MKLMatrix A(m, n, (int *)rowptrA.data(), (int *)colidxA.data(), nullptr); - MKLMatrix B(n, k, (int *)rowptrB.data(), 
(int *)colidxB.data(), nullptr); + MKLMatrix A(m, n, (MKL_INT *)rowptrA.data(), (MKL_INT *)colidxA.data(), + nullptr); + MKLMatrix B(n, k, (MKL_INT *)rowptrB.data(), (MKL_INT *)colidxB.data(), + nullptr); sparse_matrix_t C; matrix_descr generalDescr; generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL; @@ -621,53 +623,53 @@ void spgemm_symbolic_mkl( handle->set_c_nnz(rowptrC(m)); } -#define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ - template <> \ - struct SPGEMM_SYMBOLIC< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>; \ - using c_int_view_t = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void spgemm_symbolic(KernelHandle *handle, \ - typename KernelHandle::nnz_lno_t m, \ - typename KernelHandle::nnz_lno_t n, \ - typename KernelHandle::nnz_lno_t k, \ - c_int_view_t row_mapA, c_int_view_t entriesA, \ - bool, c_int_view_t row_mapB, \ - c_int_view_t entriesB, bool, \ - int_view_t row_mapC, bool) { \ - std::string label = "KokkosSparse::spgemm_symbolic[TPL_MKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spgemm_symbolic_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, \ - entriesA, row_mapB, entriesB, row_mapC); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ + template <> \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, 
const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spgemm_symbolic(KernelHandle *handle, \ + typename KernelHandle::nnz_lno_t m, \ + typename KernelHandle::nnz_lno_t n, \ + typename KernelHandle::nnz_lno_t k, \ + c_int_view_t row_mapA, c_int_view_t entriesA, \ + bool, c_int_view_t row_mapB, \ + c_int_view_t entriesB, bool, \ + int_view_t row_mapC, bool) { \ + std::string label = "KokkosSparse::spgemm_symbolic[TPL_MKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spgemm_symbolic_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, \ + entriesA, row_mapB, entriesB, row_mapC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define SPGEMM_SYMBOLIC_DECL_MKL_SE(SCALAR, EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spiluk_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spiluk_numeric_tpl_spec_avail.hpp index cf2a653e2a..87a4b9f001 100644 --- a/sparse/tpls/KokkosSparse_spiluk_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spiluk_numeric_tpl_spec_avail.hpp @@ -20,10 +20,10 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spiluk_numeric_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 97a9790385..b9c1f6c1dd 100644 
--- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -17,6 +17,10 @@ #ifndef KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Experimental { namespace Impl { @@ -124,8 +128,9 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int, const SCALAR*, \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ @@ -248,6 +253,68 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + +#include "KokkosSparse_Utils_rocsparse.hpp" + +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50200 + +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(float, rocsparse_int, + rocsparse_int, + Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(double, rocsparse_int, + rocsparse_int, + Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(float, rocsparse_int, + rocsparse_int, + Kokkos::LayoutRight, + Kokkos::HIPSpace) 
+KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(double, rocsparse_int, + rocsparse_int, + Kokkos::LayoutRight, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, + rocsparse_int, + rocsparse_int, + Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, + rocsparse_int, + rocsparse_int, + Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, + rocsparse_int, + rocsparse_int, + Kokkos::LayoutRight, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, + rocsparse_int, + rocsparse_int, + Kokkos::LayoutRight, + Kokkos::HIPSpace) + +#endif // KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50200 + +#undef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE + +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + } // namespace Impl } // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 0e0fe463a5..c6136eab3e 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -17,6 +17,7 @@ #ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#include "KokkosKernels_AlwaysFalse.hpp" #include "KokkosKernels_Controls.hpp" #include "KokkosSparse_Utils_mkl.hpp" @@ -41,14 +42,15 @@ inline matrix_descr getDescription() { } inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, - int m, int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, + MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, 
SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL( @@ -56,15 +58,15 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, } inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, - const double* Avalues, const double* x, - double* y) { + double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, + const double* x, double* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL( @@ -73,16 +75,17 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, inline void spmv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), 
alpha.imag()}; MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; @@ -94,16 +97,17 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, inline void spmv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*)Avalues)); matrix_descr A_descr = getDescription(); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; @@ -114,15 +118,16 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, } inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, - float beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, - const float* Avalues, const float* x, - int colx, int ldx, float* y, int ldy) { + float beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, + const float* x, MKL_INT colx, MKL_INT ldx, + float* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, @@ -131,15 +136,17 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, } inline void 
spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, + double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, const double* x, - int colx, int ldx, double* y, int ldy) { + MKL_INT colx, MKL_INT ldx, double* y, + MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, @@ -147,19 +154,17 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, ldx, beta, y, ldy)); } -inline void spm_mv_block_impl_mkl(sparse_operation_t op, - Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, - const int* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, - int ldx, Kokkos::complex* y, int ldy) { +inline void spm_mv_block_impl_mkl( + sparse_operation_t op, Kokkos::complex alpha, + Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Kokkos::complex* Avalues, const Kokkos::complex* x, + MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; @@ 
-172,15 +177,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, inline void spm_mv_block_impl_mkl( sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, int b, const int* Arowptrs, - const int* Aentries, const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, int ldx, - Kokkos::complex* y, int ldy) { + Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Kokkos::complex* Avalues, const Kokkos::complex* x, + MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*)Avalues)); matrix_descr A_descr = getDescription(); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; @@ -195,25 +200,26 @@ inline void spm_mv_block_impl_mkl( #if (__INTEL_MKL__ == 2017) -inline void spmv_block_impl_mkl(char mode, float alpha, float beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, +inline void spmv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { mkl_sbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spmv_block_impl_mkl(char mode, double alpha, double beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const double* Avalues, +inline void spmv_block_impl_mkl(char mode, double alpha, double beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, const double* x, double* y) { mkl_dbsrmv(&mode, &m, &n, &b, &alpha, 
"G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -228,8 +234,9 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, } inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -244,31 +251,31 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); } -inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, - const float* x, int colx, int ldx, float* y, - int ldy) { +inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, + const float* x, MKL_INT colx, MKL_INT ldx, + float* y, MKL_INT ldy) { mkl_sbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const double* Avalues, - const double* x, int colx, int ldx, double* y, - int ldy) { +inline void spm_mv_block_impl_mkl( + char mode, double alpha, double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, + const double* x, 
MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, ldx, &beta, y, ldy); } inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, - const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, - int ldx, Kokkos::complex* y, int ldy) { + const Kokkos::complex* x, MKL_INT colx, + MKL_INT ldx, Kokkos::complex* y, + MKL_INT ldy) { const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); const MKL_Complex8* Avalues_mkl = @@ -279,11 +286,14 @@ inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); } -inline void spm_mv_block_impl_mkl( - char mode, Kokkos::complex alpha, Kokkos::complex beta, - int m, int n, int b, const int* Arowptrs, const int* Aentries, - const Kokkos::complex* Avalues, const Kokkos::complex* x, - int colx, int ldx, Kokkos::complex* y, int ldy) { +inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, + MKL_INT colx, MKL_INT ldx, + Kokkos::complex* y, MKL_INT ldy) { const MKL_Complex16* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); @@ -300,16 +310,17 @@ inline void spm_mv_block_impl_mkl( #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, 
SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - BsrMatrix, int const>; \ + BsrMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -357,8 +368,9 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #define KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_MV_BSRMATRIX< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const**, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const**, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, \ SCALAR**, Kokkos::LayoutLeft, \ @@ -366,8 +378,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - BsrMatrix, int const>; \ + BsrMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -382,9 +394,9 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - int colx = static_cast(X.extent(1)); \ - int ldx = static_cast(X.stride_1()); \ - int ldy = static_cast(Y.stride_1()); \ + MKL_INT colx = static_cast(X.extent(1)); \ + MKL_INT ldx = static_cast(X.stride_1()); \ + MKL_INT ldy = static_cast(Y.stride_1()); \ spm_mv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ A.numCols(), A.blockDim(), A.graph.row_map.data(), \ A.graph.entries.data(), A.values.data(), X.data(), \ @@ -421,7 +433,7 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, } // namespace Experimental } 
// namespace KokkosSparse -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -525,7 +537,7 @@ void spmv_block_impl_cusparse( } KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); -#endif // CUDA_VERSION +#endif // (9000 <= CUDA_VERSION) } // Reference @@ -542,7 +554,7 @@ void spmv_block_impl_cusparse( // -> t(t(C)) = t(A * t(B)) + t(t(C)) // -> C = t(t(B)) * t(A) + C // -> C = B * t(A) + C -// This is impossible in cuSparse without explicitly transposing C, +// This is impossible in cuSparse without explicitly transposing A, // so we just do not support LayoutRight in cuSparse TPL now // template < @@ -647,7 +659,7 @@ void spm_mv_block_impl_cusparse( } KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); -#endif // CUDA_VERSION +#endif // (9000 <= CUDA_VERSION) } #define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ @@ -735,7 +747,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif // 9000 <= CUDA_VERSION +#endif // (9000 <= CUDA_VERSION) #undef KOKKOSSPARSE_SPMV_CUSPARSE @@ -803,7 +815,7 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::CudaUVMSpace, false) -#endif // 9000 <= CUDA_VERSION +#endif // (9000 <= CUDA_VERSION) #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE @@ -813,4 +825,256 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +// -------------------- +// rocSparse +// -------------------- +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + +#include + +#include "KokkosSparse_Utils_rocsparse.hpp" + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { + +template +void spmv_block_impl_rocsparse( + const KokkosKernels::Experimental::Controls& controls, const char mode[], 
+ typename YVector::non_const_value_type const& alpha, const AMatrix& A, + const XVector& x, typename YVector::non_const_value_type const& beta, + const YVector& y) { + /* + rocm 5.4.0 rocsparse_*bsrmv reference: + https://rocsparse.readthedocs.io/en/rocm-5.4.0/usermanual.html#rocsparse-bsrmv-ex + + only trans = rocsparse_operation_none is supported + only descr = rocsparse_matrix_type_general is supported + + */ + + using offset_type = typename AMatrix::non_const_size_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + using rocsparse_value_type = + typename KokkosSparse::Impl::kokkos_to_rocsparse_type::type; + + // assert ordinals and offsets are the expected types + static_assert(std::is_same_v, + "A offset_type must be rocsparse_int"); + static_assert(std::is_same_v, + "A ordinal_type must be rocsparse_int"); + + // assert all operands are the same type + using x_value_type = typename XVector::non_const_value_type; + using y_value_type = typename YVector::non_const_value_type; + static_assert(std::is_same_v, + "A and x must have same value type"); + static_assert(std::is_same_v, + "A and y must have same value type"); + + // assert X and Y are non-stride (pass raw pointers to TPL) + static_assert( + !std::is_same_v, + "x must be contiguous"); + static_assert( + !std::is_same_v, + "y must be contiguous"); + + // assert BSR data is non-stride (pass raw pointers to TPL) + static_assert(!std::is_same_v, + "A values must be contiguous"); + static_assert(!std::is_same_v, + "A row_map must be contiguous"); + static_assert(!std::is_same_v, + "A entries must be contiguous"); + + rocsparse_handle handle = controls.getRocsparseHandle(); + + // set the mode + rocsparse_operation trans; + switch (toupper(mode[0])) { + case 'N': trans = rocsparse_operation_none; break; + default: { + std::stringstream ss; + ss << "Mode " << mode << " invalid for rocsparse_[*]bsrmv\n"; + throw 
std::invalid_argument(ss.str()); + } + } + + /* + Specify the matrix direction. + The rocsparse_direction indicates whether a dense matrix should be parsed by + rows or by columns, assuming column-major storage. Values: enumerator + rocsparse_direction_row Parse the matrix by rows. enumerator + rocsparse_direction_column Parse the matrix by columns. + */ + // KokkosSparse Bsr matrix blocks are layoutright (row-major) + static_assert( + std::is_same_v, + "A blocks must be stored layout-right"); + rocsparse_direction dir = rocsparse_direction_row; + + const rocsparse_int mb = rocsparse_int(A.numRows()); // number of block rows + const rocsparse_int nb = rocsparse_int(A.numCols()); // number of block cols + const rocsparse_int nnzb = + rocsparse_int(A.nnz()); // number of non-zero blocks + const rocsparse_value_type* alpha_ = + reinterpret_cast(&alpha); + + const rocsparse_value_type* bsr_val = + reinterpret_cast(A.values.data()); + const rocsparse_int* bsr_row_ptr = A.graph.row_map.data(); + const rocsparse_int* bsr_col_ind = A.graph.entries.data(); + const rocsparse_int block_dim = rocsparse_int(A.blockDim()); + const rocsparse_value_type* x_ = + reinterpret_cast(x.data()); + const rocsparse_value_type* beta_ = + reinterpret_cast(&beta); + rocsparse_value_type* y_ = reinterpret_cast(y.data()); + + rocsparse_mat_descr descr; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr)); + rocsparse_mat_info info; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info)); + + // *_ex* functions introduced in 5.4.0 +#if KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, x_, beta_, y_)); + } else if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, x_, 
beta_, y_)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, x_, beta_, y_)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, x_, beta_, y_)); + } else { + static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for rocsparse_*bsrmv"); + } +#else + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v>) { + 
KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else { + static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for rocsparse_*bsrmv"); + } +#endif + rocsparse_destroy_mat_descr(descr); + rocsparse_destroy_mat_info(info); + +} // spmv_block_impl_rocsparse + +#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const, SCALAR const*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = BsrMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_bsrmatrix(const Controls& controls, const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_block_impl_rocsparse(controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSSPARSE_SPMV_ROCSPARSE(float, rocsparse_int, rocsparse_int, + 
Kokkos::LayoutLeft, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(float, rocsparse_int, rocsparse_int, + Kokkos::LayoutRight, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(double, rocsparse_int, rocsparse_int, + Kokkos::LayoutLeft, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(double, rocsparse_int, rocsparse_int, + Kokkos::LayoutRight, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, + rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, + rocsparse_int, Kokkos::LayoutRight, + Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, + rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, + rocsparse_int, Kokkos::LayoutRight, + Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + +#undef KOKKOSSPARSE_SPMV_ROCSPARSE + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + #endif // KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 4a92741cc5..060fef45bb 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -17,6 +17,10 @@ #ifndef KOKKOSPARSE_SPMV_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPMV_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists @@ -214,8 +218,9 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, 
#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int, const SCALAR*, \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index f223ed0e5a..ecbe45c7fd 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -94,7 +94,7 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, size_t bufferSize = 0; void* dBuffer = NULL; -#if CUSPARSE_VERSION >= 11201 +#if CUSPARSE_VERSION >= 11301 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; #else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; @@ -102,13 +102,13 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, if (controls.isParameter("algorithm")) { const std::string algName = controls.getParameter("algorithm"); if (algName == "default") -#if CUSPARSE_VERSION >= 11201 +#if CUSPARSE_VERSION >= 11301 alg = CUSPARSE_SPMV_ALG_DEFAULT; #else alg = CUSPARSE_MV_ALG_DEFAULT; #endif else if (algName == "merge") -#if CUSPARSE_VERSION >= 11201 +#if CUSPARSE_VERSION >= 11301 alg = CUSPARSE_SPMV_CSR_ALG2; #else alg = CUSPARSE_CSRMV_ALG2; @@ -343,6 +343,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, // rocSPARSE #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) #include +#include #include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosSparse { @@ -421,6 +422,16 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, else if (algName == "merge") alg = rocsparse_spmv_alg_csr_stream; } + +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( 
+ handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); +#else KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, compute_type, alg, &buffer_size, tmp_buffer)); @@ -428,6 +439,7 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, compute_type, alg, &buffer_size, tmp_buffer)); +#endif KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(tmp_buffer)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecY)); @@ -494,6 +506,8 @@ KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#undef KOKKOSSPARSE_SPMV_ROCSPARSE + } // namespace Impl } // namespace KokkosSparse #endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE @@ -508,41 +522,43 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, - int n, const int* Arowptrs, const int* Aentries, - const float* Avalues, const float* x, float* y) { +inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, + MKL_INT n, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, + const float* x, float* y) { sparse_matrix_t A_mkl; matrix_descr A_descr; A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; 
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } -inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, - int n, const int* Arowptrs, const int* Aentries, - const double* Avalues, const double* x, double* y) { +inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, + MKL_INT m, MKL_INT n, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, + const double* x, double* y) { sparse_matrix_t A_mkl; matrix_descr A_descr; A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -552,8 +568,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, 
SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; @@ -563,8 +579,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -574,8 +590,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; @@ -587,16 +603,17 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - CrsMatrix, int const>; \ + CrsMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -633,23 
+650,23 @@ inline char mode_kk_to_mkl(char mode_kk) { "Invalid mode for MKL (should be one of N, T, H)"); } -inline void spmv_mkl(char mode, float alpha, float beta, int m, int n, - const int* Arowptrs, const int* Aentries, +inline void spmv_mkl(char mode, float alpha, float beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { mkl_scsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spmv_mkl(char mode, double alpha, double beta, int m, int n, - const int* Arowptrs, const int* Aentries, +inline void spmv_mkl(char mode, double alpha, double beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, double* y) { mkl_dcsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -664,8 +681,8 @@ inline void spmv_mkl(char mode, Kokkos::complex alpha, } inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -683,16 +700,17 @@ inline void spmv_mkl(char mode, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ 
Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - CrsMatrix, int const>; \ + CrsMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ diff --git a/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp index c12e8bb335..1a22146d01 100644 --- a/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp @@ -20,8 +20,8 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct sptrsv_solve_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index 647fff4c18..e0d0085be1 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -16,6 +16,10 @@ #ifndef TEST_SPARSE_HPP #define TEST_SPARSE_HPP +#if KOKKOS_VERSION >= 40099 +#include "Test_Sparse_coo2crs.hpp" +#endif // KOKKOS_VERSION >= 40099 +#include "Test_Sparse_crs2coo.hpp" #include "Test_Sparse_block_gauss_seidel.hpp" #include "Test_Sparse_Controls.hpp" #include "Test_Sparse_CrsMatrix.hpp" diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 935b994045..63c977ca9a 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -32,9 +32,19 @@ #include #include +namespace SortCrsTest { +enum : int { + Instance, // Passing in an instance, and deducing template args + ExplicitType, // Using default instance, but specifying type with template + // arg + ImplicitType // Using default instance, and deducing type based on view +}; +} + template void testSortCRS(default_lno_t numRows, 
default_lno_t numCols, - default_size_type nnz, bool doValues, bool doStructInterface) { + default_size_type nnz, bool doValues, bool doStructInterface, + int howExecSpecified) { using scalar_t = default_scalar; using lno_t = default_lno_t; using size_type = default_size_type; @@ -42,9 +52,6 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, using device_t = Kokkos::Device; using crsMat_t = KokkosSparse::CrsMatrix; - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename crsMat_t::values_type; // Create a random matrix on device // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this // wouldn't test anything @@ -89,17 +96,52 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, // call the actual sort routine being tested if (doValues) { if (doStructInterface) { - KokkosSparse::sort_crs_matrix(A); + switch (howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_crs_matrix(exec_space(), A); + break; + case SortCrsTest::ExplicitType: + throw std::logic_error("Should not get here"); + case SortCrsTest::ImplicitType: KokkosSparse::sort_crs_matrix(A); + } } else { - KokkosSparse::sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); + switch (howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_crs_matrix(exec_space(), A.graph.row_map, + A.graph.entries, A.values); + break; + case SortCrsTest::ExplicitType: + KokkosSparse::sort_crs_matrix(A.graph.row_map, + A.graph.entries, A.values); + break; + case SortCrsTest::ImplicitType: + KokkosSparse::sort_crs_matrix(A.graph.row_map, A.graph.entries, + A.values); + } } } else { if (doStructInterface) { - KokkosSparse::sort_crs_graph(A.graph); + switch (howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_crs_graph(exec_space(), A.graph); + break; + case SortCrsTest::ExplicitType: + throw std::logic_error("Should not get here"); + case 
SortCrsTest::ImplicitType: KokkosSparse::sort_crs_graph(A.graph); + } } else { - KokkosSparse::sort_crs_graph( - A.graph.row_map, A.graph.entries); + switch (howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_crs_graph(exec_space(), A.graph.row_map, + A.graph.entries); + break; + case SortCrsTest::ExplicitType: + KokkosSparse::sort_crs_graph(A.graph.row_map, + A.graph.entries); + break; + case SortCrsTest::ImplicitType: + KokkosSparse::sort_crs_graph(A.graph.row_map, A.graph.entries); + } } } // Copy to host and compare @@ -166,7 +208,8 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { } template -void testSortAndMerge() { +void testSortAndMerge(bool justGraph, int howExecSpecified, + bool doStructInterface, int testCase) { using size_type = default_size_type; using lno_t = default_lno_t; using scalar_t = default_scalar; @@ -174,109 +217,303 @@ void testSortAndMerge() { using device_t = Kokkos::Device; using crsMat_t = KokkosSparse::CrsMatrix; + using graph_t = typename crsMat_t::staticcrsgraph_type; using rowmap_t = typename crsMat_t::row_map_type::non_const_type; using entries_t = typename crsMat_t::index_type; using values_t = typename crsMat_t::values_type; using Kokkos::HostSpace; using Kokkos::MemoryTraits; using Kokkos::Unmanaged; - // Create a small CRS matrix on host - std::vector inRowmap = {0, 4, 4, 5, 7, 10}; - std::vector inEntries = { - 4, 3, 5, 3, // row 0 - // row 1 has no entries - 6, // row 2 - 2, 2, // row 3 - 0, 1, 2 // row 4 - }; - // note: choosing values that can be represented exactly by float - std::vector inValues = { - 1.5, 4, 1, -3, // row 0 - // row 1 - 2, // row 2 - -1, -2, // row 3 - 0, 3.5, -2.25 // row 4 - }; - lno_t nrows = 5; - lno_t ncols = 7; + // Select a test case: matrices and correct ouptut are hardcoded for each + std::vector inRowmap; + std::vector inEntries; + std::vector inValues; + std::vector goldRowmap; + std::vector goldEntries; + std::vector goldValues; + lno_t nrows = 0; + 
lno_t ncols = 0; + switch (testCase) { + case 0: { + // Two merges take place, and one depends on sorting being done correctly + nrows = 5; + ncols = 7; + inRowmap = {0, 4, 4, 5, 7, 10}; + inEntries = { + 4, 3, 5, 3, // row 0 + // row 1 has no entries + 6, // row 2 + 2, 2, // row 3 + 0, 1, 2 // row 4 + }; + // note: choosing values that can be represented exactly by float + inValues = { + 1.5, 4, 1, -3, // row 0 + // row 1 + 2, // row 2 + -1, -2, // row 3 + 0, 3.5, -2.25 // row 4 + }; + // Expect 2 merges to have taken place + goldRowmap = {0, 3, 3, 4, 5, 8}; + goldEntries = { + 3, 4, 5, // row 0 + // row 1 has no entries + 6, // row 2 + 2, // row 3 + 0, 1, 2 // row 4 + }; + goldValues = { + 1, 1.5, 1, // row 0 + // row 1 + 2, // row 2 + -3, // row 3 + 0, 3.5, -2.25 // row 4 + }; + break; + } + case 1: { + // Same as above, but no merges take place + nrows = 5; + ncols = 7; + inRowmap = {0, 3, 3, 4, 5, 8}; + inEntries = { + 4, 5, 3, // row 0 + // row 1 has no entries + 6, // row 2 + 2, // row 3 + 0, 1, 2 // row 4 + }; + inValues = { + 1.5, 4, 1, // row 0 + // row 1 + 2, // row 2 + -1, // row 3 + 0, 3.5, -2.25 // row 4 + }; + // Expect 2 merges to have taken place + goldRowmap = {0, 3, 3, 4, 5, 8}; + goldEntries = { + 3, 4, 5, // row 0 + // row 1 has no entries + 6, // row 2 + 2, // row 3 + 0, 1, 2 // row 4 + }; + goldValues = { + 1, 1.5, 4, // row 0 + // row 1 + 2, // row 2 + -1, // row 3 + 0, 3.5, -2.25 // row 4 + }; + break; + } + case 2: { + // Nonzero dimensions but no entries + nrows = 5; + ncols = 7; + inRowmap = {0, 0, 0, 0, 0, 0}; + goldRowmap = inRowmap; + break; + } + case 3: { + // Zero rows, length-zero rowmap + break; + } + case 4: { + // Zero rows, length-one rowmap + inRowmap = {0}; + goldRowmap = {0}; + break; + } + } size_type nnz = inEntries.size(); Kokkos::View> hostInRowmap( - inRowmap.data(), nrows + 1); + inRowmap.data(), inRowmap.size()); Kokkos::View> hostInEntries( inEntries.data(), nnz); Kokkos::View> hostInValues( inValues.data(), nnz); - 
rowmap_t devInRowmap("", nrows + 1); - entries_t devInEntries("", nnz); - values_t devInValues("", nnz); + rowmap_t devInRowmap("in rowmap", inRowmap.size()); + entries_t devInEntries("in entries", nnz); + values_t devInValues("in values", nnz); Kokkos::deep_copy(devInRowmap, hostInRowmap); Kokkos::deep_copy(devInEntries, hostInEntries); Kokkos::deep_copy(devInValues, hostInValues); crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap, devInEntries); - crsMat_t output = KokkosSparse::sort_and_merge_matrix(input); - exec_space().fence(); - EXPECT_EQ(output.numRows(), nrows); - EXPECT_EQ(output.numCols(), ncols); + crsMat_t output; + if (justGraph) { + graph_t outputGraph; + // Testing sort_and_merge_graph + if (doStructInterface) { + switch (howExecSpecified) { + case SortCrsTest::Instance: + outputGraph = + KokkosSparse::sort_and_merge_graph(exec_space(), input.graph); + break; + case SortCrsTest::ExplicitType: + throw std::logic_error("Should not get here"); + case SortCrsTest::ImplicitType: + outputGraph = KokkosSparse::sort_and_merge_graph(input.graph); + } + } else { + rowmap_t devOutRowmap; + entries_t devOutEntries; + switch (howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_and_merge_graph(exec_space(), input.graph.row_map, + input.graph.entries, devOutRowmap, + devOutEntries); + break; + case SortCrsTest::ExplicitType: + KokkosSparse::sort_and_merge_graph( + input.graph.row_map, input.graph.entries, devOutRowmap, + devOutEntries); + break; + case SortCrsTest::ImplicitType: + KokkosSparse::sort_and_merge_graph(input.graph.row_map, + input.graph.entries, devOutRowmap, + devOutEntries); + } + outputGraph = graph_t(devOutEntries, devOutRowmap); + } + // Construct output using the output graph, leaving values zero-initialized + output = crsMat_t("Output", outputGraph, ncols); + } else { + // Testing sort_and_merge_matrix + if (doStructInterface) { + switch (howExecSpecified) { + case SortCrsTest::Instance: + output = 
KokkosSparse::sort_and_merge_matrix(exec_space(), input); + break; + case SortCrsTest::ExplicitType: + throw std::logic_error("Should not get here"); + case SortCrsTest::ImplicitType: + output = KokkosSparse::sort_and_merge_matrix(input); + } + } else { + rowmap_t devOutRowmap; + entries_t devOutEntries; + values_t devOutValues; + switch (howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_and_merge_matrix( + exec_space(), input.graph.row_map, input.graph.entries, + input.values, devOutRowmap, devOutEntries, devOutValues); + break; + case SortCrsTest::ExplicitType: + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); + break; + case SortCrsTest::ImplicitType: + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); + } + // and then construct output from views + output = crsMat_t("Output", nrows, ncols, devOutValues.extent(0), + devOutValues, devOutRowmap, devOutEntries); + } + EXPECT_EQ(output.numRows(), nrows); + EXPECT_EQ(output.numCols(), ncols); + } auto outRowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.graph.row_map); auto outEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.graph.entries); auto outValues = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.values); - // Expect 2 merges to have taken place - std::vector goldRowmap = {0, 3, 3, 4, 5, 8}; - std::vector goldEntries = { - 3, 4, 5, // row 0 - // row 1 has no entries - 6, // row 2 - 2, // row 3 - 0, 1, 2 // row 4 - }; - // note: choosing values that can be represented exactly by float - std::vector goldValues = { - 1, 1.5, 1, // row 0 - // row 1 - 2, // row 2 - -3, // row 3 - 0, 3.5, -2.25 // row 4 - }; EXPECT_EQ(goldRowmap.size(), outRowmap.extent(0)); EXPECT_EQ(goldEntries.size(), outEntries.extent(0)); - EXPECT_EQ(goldValues.size(), 
outValues.extent(0)); - EXPECT_EQ(goldValues.size(), output.nnz()); - for (lno_t i = 0; i < nrows + 1; i++) EXPECT_EQ(goldRowmap[i], outRowmap(i)); - for (size_type i = 0; i < output.nnz(); i++) { + if (!justGraph) { + EXPECT_EQ(goldValues.size(), outValues.extent(0)); + EXPECT_EQ(goldValues.size(), output.nnz()); + } + for (size_t i = 0; i < goldRowmap.size(); i++) + EXPECT_EQ(goldRowmap[i], outRowmap(i)); + for (size_t i = 0; i < goldEntries.size(); i++) { EXPECT_EQ(goldEntries[i], outEntries(i)); - EXPECT_EQ(goldValues[i], outValues(i)); + if (!justGraph) { + EXPECT_EQ(goldValues[i], outValues(i)); + } } } TEST_F(TestCategory, common_sort_crsgraph) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - testSortCRS(10, 10, 20, false, doStructInterface); - testSortCRS(100, 100, 2000, false, doStructInterface); - testSortCRS(1000, 1000, 30000, false, doStructInterface); + for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { + // If using the struct interface (StaticCrsGraph), cannot use ExplicitType + // because the exec space type is determined from the graph. 
+ if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + continue; + testSortCRS(10, 10, 20, false, doStructInterface, + howExecSpecified); + testSortCRS(100, 100, 2000, false, doStructInterface, + howExecSpecified); + testSortCRS(1000, 1000, 30000, false, doStructInterface, + howExecSpecified); + } testSortCRSUnmanaged(false, doStructInterface); } } TEST_F(TestCategory, common_sort_crsmatrix) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - testSortCRS(10, 10, 20, true, doStructInterface); - testSortCRS(100, 100, 2000, true, doStructInterface); - testSortCRS(1000, 1000, 30000, true, doStructInterface); + // howExecSpecified: Instance, ExplicitType, ImplicitType + for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { + // If using the struct interface (CrsMatrix), cannot use ExplicitType + // because the exec space type is determined from the matrix. + if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + continue; + testSortCRS(10, 10, 20, true, doStructInterface, + howExecSpecified); + testSortCRS(100, 100, 2000, true, doStructInterface, + howExecSpecified); + testSortCRS(1000, 1000, 30000, true, doStructInterface, + howExecSpecified); + } testSortCRSUnmanaged(true, doStructInterface); } } TEST_F(TestCategory, common_sort_crs_longrows) { - testSortCRS(1, 50000, 10000, false, false); - testSortCRS(1, 50000, 10000, true, false); + // Matrix/graph with one very long row + // Just test this once with graph, and once with matrix + testSortCRS(1, 50000, 10000, false, false, + SortCrsTest::ImplicitType); + testSortCRS(1, 50000, 10000, true, false, + SortCrsTest::ImplicitType); } TEST_F(TestCategory, common_sort_merge_crsmatrix) { - testSortAndMerge(); + for (int testCase = 0; testCase < 5; testCase++) { + for (int doStructInterface = 0; doStructInterface < 2; + doStructInterface++) { + for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { + if 
(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + continue; + testSortAndMerge(false, howExecSpecified, + doStructInterface, testCase); + } + } + } +} + +TEST_F(TestCategory, common_sort_merge_crsgraph) { + for (int testCase = 0; testCase < 5; testCase++) { + for (int doStructInterface = 0; doStructInterface < 2; + doStructInterface++) { + for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { + if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + continue; + testSortAndMerge(true, howExecSpecified, + doStructInterface, testCase); + } + } + } } #endif // KOKKOSSPARSE_SORTCRSTEST_HPP diff --git a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp index 856b888c1a..279f4f89f9 100644 --- a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp +++ b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp @@ -19,24 +19,25 @@ namespace Test { template void doCsMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { - auto expected_min = ScalarType(1.0); - int64_t expected_nnz = 0; + auto expected_min = ScalarType(1.0); + size_t expected_nnz = 0; RandCsMatrix cm(m, n, min_val, max_val); - for (int64_t i = 0; i < cm.get_nnz(); ++i) + for (size_t i = 0; i < cm.get_nnz(); ++i) ASSERT_GE(cm(i), expected_min) << cm.info; auto map_d = cm.get_map(); auto map = Kokkos::create_mirror_view(map_d); Kokkos::deep_copy(map, map_d); + // Here we treat 'cm' as a Ccs matrix for (int64_t j = 0; j < cm.get_dim1(); ++j) { - int64_t row_len = j < static_cast(m) ? (map(j + 1) - map(j)) : 0; - for (int64_t i = 0; i < row_len; ++i) { - int64_t row_start = j < static_cast(m) ? map(j) : 0; - ASSERT_FLOAT_EQ(cm(row_start + i), cm(expected_nnz + i)) << cm.info; + int64_t col_len = j < static_cast(m) ? (map(j + 1) - map(j)) : 0; + for (int64_t i = 0; i < col_len; ++i) { + int64_t col_start = j < static_cast(m) ? 
map(j) : 0; + ASSERT_FLOAT_EQ(cm(col_start + i), cm(expected_nnz + i)) << cm.info; } - expected_nnz += row_len; + expected_nnz += col_len; } ASSERT_EQ(cm.get_nnz(), expected_nnz) << cm.info; @@ -45,10 +46,12 @@ void doCsMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { ASSERT_EQ(vals.extent(0), cm.get_nnz() + 1) << cm.info; auto row_ids = cm.get_ids(); - ASSERT_EQ(row_ids.extent(0), cm.get_dim1() * cm.get_dim2() + 1) << cm.info; + ASSERT_EQ(row_ids.extent(0), cm.get_nnz()) << cm.info; auto col_map = cm.get_map(); ASSERT_EQ(col_map.extent(0), cm.get_dim1() + 1); + + ASSERT_EQ(map(cm.get_dim1()), expected_nnz) << cm.info; } template diff --git a/sparse/unit_test/Test_Sparse_Utils.hpp b/sparse/unit_test/Test_Sparse_Utils.hpp index 73320e9358..cbd81e9b08 100644 --- a/sparse/unit_test/Test_Sparse_Utils.hpp +++ b/sparse/unit_test/Test_Sparse_Utils.hpp @@ -118,7 +118,7 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { return false; } - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename scalar_view_t::non_const_value_type>::mag_type eps_type; eps_type eps = std::is_same::value ? 
3.7e-3 : 1e-7; diff --git a/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp index 4c445f439f..11830e0224 100644 --- a/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp @@ -75,7 +75,7 @@ int run_block_gauss_seidel_1( GSApplyType apply_type = Test::symmetric, bool skip_symbolic = false, bool skip_numeric = false, size_t shmem_size = 32128, typename mtx_t::value_type omega = - Kokkos::Details::ArithTraits::one()) { + Kokkos::ArithTraits::one()) { typedef typename mtx_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -156,7 +156,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_view_t; typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; @@ -243,7 +243,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; typedef Kokkos::View scalar_view2d_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; @@ -289,8 +289,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, for (lno_t j = 0; j < nv; j++) { sum += solution_host(j, i) * solution_host(j, i); } - initial_norms[i] = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(sum)); + initial_norms[i] = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(sum)); } for (const auto gs_algorithm : params.gs_algorithms) { @@ -322,8 +322,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, scalar_t diff = x_host(r, c) - solution_host(r, c); sum += diff * diff; } - mag_t 
result_res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(sum)); + mag_t result_res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(sum)); EXPECT_LT(result_res, params.tolerance * initial_norms[c]); } } diff --git a/sparse/unit_test/Test_Sparse_bspgemm.hpp b/sparse/unit_test/Test_Sparse_bspgemm.hpp index b760e7e69c..58a2a18b8a 100644 --- a/sparse/unit_test/Test_Sparse_bspgemm.hpp +++ b/sparse/unit_test/Test_Sparse_bspgemm.hpp @@ -123,7 +123,7 @@ bool is_same_block_matrix(bsrMat_t output_mat_actual, return false; } - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename scalar_view_t::non_const_value_type>::mag_type eps_type; eps_type eps = std::is_same::value ? 3e-2 : 5e-7; diff --git a/sparse/unit_test/Test_Sparse_ccs2crs.hpp b/sparse/unit_test/Test_Sparse_ccs2crs.hpp index 902bf41b4f..56972b8a07 100644 --- a/sparse/unit_test/Test_Sparse_ccs2crs.hpp +++ b/sparse/unit_test/Test_Sparse_ccs2crs.hpp @@ -24,6 +24,9 @@ template +CrsType vanilla_coo2crs(size_t m, size_t n, RowType row, ColType col, + DataType data) { + using RowIndexType = typename RowType::value_type; + using ColIndexType = typename ColType::value_type; + using ValueType = typename DataType::value_type; + std::unordered_map *> + umap; + int nnz = 0; + + for (uint64_t i = 0; i < data.extent(0); i++) { + auto r = row(i); + auto c = col(i); + auto d = data(i); + + if (r >= 0 && c >= 0) { + if (umap.find(r) != umap.end()) { // exists + auto my_row = umap.at(r); + if (my_row->find(c) != my_row->end()) + my_row->at(c) += d; + else { + my_row->insert(std::make_pair(c, d)); + nnz++; + } + } else { // create a new row. 
+ auto new_row = new std::unordered_map(); + umap.insert(std::make_pair(r, new_row)); + new_row->insert(std::make_pair(c, d)); + nnz++; + } + } + } + + typename CrsType::row_map_type::non_const_type row_map("vanilla_row_map", + m + 1); + typename CrsType::values_type values("vanilla_values", nnz); + typename CrsType::staticcrsgraph_type::entries_type col_ids("vanilla_col_ids", + nnz); + + typename CrsType::row_map_type::non_const_type::HostMirror row_map_h = + Kokkos::create_mirror_view(row_map); + typename CrsType::values_type::HostMirror values_h = + Kokkos::create_mirror_view(values); + typename CrsType::staticcrsgraph_type::entries_type::HostMirror col_ids_h = + Kokkos::create_mirror_view(col_ids); + + int row_len = 0; + for (uint64_t i = 0; i < m; i++) { + if (umap.find(i) != umap.end()) row_len += umap.at(i)->size(); + row_map_h(i + 1) = row_len; + } + + for (uint64_t i = 0; i < m; i++) { + if (umap.find(i) == umap.end()) // Fully sparse row + continue; + + auto row_start = row_map_h(i); + auto row_end = row_map_h(i + 1); + auto my_row = umap.at(i); + auto iter = my_row->begin(); + for (auto j = row_start; j < row_end; j++, iter++) { + col_ids_h(j) = iter->first; + values_h(j) = iter->second; + } + delete my_row; + } + + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(col_ids, col_ids_h); + Kokkos::deep_copy(values, values_h); + + return CrsType("vanilla_coo2csr", m, n, nnz, values, row_map, col_ids); +} + +template +void check_crs_matrix(CrsType crsMat, RowType row, ColType col, DataType data, + std::string failure_info = "no failure information!") { + using value_type = typename DataType::value_type; + using ats = Kokkos::ArithTraits; + + // Copy coo to host + typename RowType::HostMirror row_h = Kokkos::create_mirror_view(row); + Kokkos::deep_copy(row_h, row); + typename ColType::HostMirror col_h = Kokkos::create_mirror_view(col); + Kokkos::deep_copy(col_h, col); + typename DataType::HostMirror data_h = Kokkos::create_mirror_view(data); + 
Kokkos::deep_copy(data_h, data); + + auto crsMatRef = vanilla_coo2crs( + crsMat.numRows(), crsMat.numCols(), row_h, col_h, data_h); + + auto crs_col_ids_ref_d = crsMatRef.graph.entries; + auto crs_row_map_ref_d = crsMatRef.graph.row_map; + auto crs_vals_ref_d = crsMatRef.values; + + using ViewTypeCrsColIdsRef = decltype(crs_col_ids_ref_d); + using ViewTypeCrsRowMapRef = decltype(crs_row_map_ref_d); + using ViewTypeCrsValsRef = decltype(crs_vals_ref_d); + + // Copy crs to host + typename ViewTypeCrsColIdsRef::HostMirror crs_col_ids_ref = + Kokkos::create_mirror_view(crs_col_ids_ref_d); + Kokkos::deep_copy(crs_col_ids_ref, crs_col_ids_ref_d); + typename ViewTypeCrsRowMapRef::HostMirror crs_row_map_ref = + Kokkos::create_mirror_view(crs_row_map_ref_d); + Kokkos::deep_copy(crs_row_map_ref, crs_row_map_ref_d); + typename ViewTypeCrsValsRef::HostMirror crs_vals_ref = + Kokkos::create_mirror_view(crs_vals_ref_d); + Kokkos::deep_copy(crs_vals_ref, crs_vals_ref_d); + + auto crs_col_ids_d = crsMat.graph.entries; + auto crs_row_map_d = crsMat.graph.row_map; + auto crs_vals_d = crsMat.values; + + using ViewTypeCrsColIds = decltype(crs_col_ids_d); + using ViewTypeCrsRowMap = decltype(crs_row_map_d); + using ViewTypeCrsVals = decltype(crs_vals_d); + + // Copy crs to host + typename ViewTypeCrsColIds::HostMirror crs_col_ids = + Kokkos::create_mirror_view(crs_col_ids_d); + Kokkos::deep_copy(crs_col_ids, crs_col_ids_d); + typename ViewTypeCrsRowMap::HostMirror crs_row_map = + Kokkos::create_mirror_view(crs_row_map_d); + Kokkos::deep_copy(crs_row_map, crs_row_map_d); + typename ViewTypeCrsVals::HostMirror crs_vals = + Kokkos::create_mirror_view(crs_vals_d); + Kokkos::deep_copy(crs_vals, crs_vals_d); + + Kokkos::fence(); + + ASSERT_EQ(crsMatRef.nnz(), crsMat.nnz()) << failure_info; + + for (int i = 0; i < crsMatRef.numRows(); i++) { + ASSERT_EQ(crs_row_map_ref(i), crs_row_map(i)) + << "crs_row_map_ref(" << i << " = " << crs_row_map_ref(i) << " != " + << "crs_row_map(" << i << " = " 
<< crs_row_map(i) << " -- " + << failure_info; + } + + for (int i = 0; i < crsMatRef.numRows(); ++i) { + auto row_start_ref = crs_row_map_ref(i); + auto row_stop_ref = crs_row_map_ref(i + 1); + auto row_len_ref = row_stop_ref - row_start_ref; + + auto row_start = crs_row_map(i); + auto row_len = crs_row_map(i + 1) - row_start; + + ASSERT_EQ(row_start_ref, row_start); + ASSERT_EQ(row_len_ref, row_len); + + for (auto j = row_start_ref; j < row_stop_ref; ++j) { + // Look for the corresponding col_id + auto col_id_ref = crs_col_ids_ref(j); + std::string fail_msg = "row: " + std::to_string(i) + + ", crs_col_ids_ref(" + std::to_string(j) + + ") = " + std::to_string(col_id_ref); + + auto k = row_start_ref; + for (; k < row_stop_ref; ++k) { + if (crs_col_ids(k) == col_id_ref) break; + } + if (k == row_stop_ref) + FAIL() << fail_msg << " not found in crs_col_ids!" << failure_info; + + // NOTE: ASSERT_EQ doesn't work -- values may be summed in different + // orders We sum at most m x n values. + auto eps = + crsMatRef.numCols() * crsMatRef.numRows() * 10e1 * ats::epsilon(); + EXPECT_NEAR_KK(crs_vals_ref(j), crs_vals(k), eps, + fail_msg + " mismatched values!" 
+ failure_info); + } + } +} + +template +void doCoo2Crs(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + RandCooMat cooMat(m, n, m * n, min_val, + max_val); + auto randRow = cooMat.get_row(); + auto randCol = cooMat.get_col(); + auto randData = cooMat.get_data(); + + std::string failure_info = + "\nBegin arguments for above failure...\n" + cooMat.info + + "scalar: " + std::string(typeid(ScalarType).name()) + "\n" + + "layout: " + std::string(typeid(LayoutType).name()) + "\n" + + "m: " + std::to_string(m) + ", n: " + std::to_string(n) + + "\n...end arguments for above failure.\n"; + + auto crsMat = KokkosSparse::coo2crs(m, n, randRow, randCol, randData); + check_crs_matrix(crsMat, randRow, randCol, randData, failure_info); +} + +template +void doAllScalarsCoo2Crs(size_t m, size_t n, int min, int max) { + doCoo2Crs(m, n, min, max); + doCoo2Crs(m, n, min, max); + doCoo2Crs, LayoutType, ExeSpaceType>(m, n, min, max); + doCoo2Crs, LayoutType, ExeSpaceType>(m, n, min, max); +} + +template +void doAllLayoutsCoo2Crs(size_t m, size_t n, int min, int max) { + doAllScalarsCoo2Crs(m, n, min, max); + doAllScalarsCoo2Crs(m, n, min, max); +} + +template +void doAllCoo2Crs(size_t m, size_t n) { + int min = 1, max = 10; + doAllLayoutsCoo2Crs(m, n, min, max); +} + +TEST_F(TestCategory, sparse_coo2crs) { + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count() % + UINT32_MAX; + std::srand(ticks); + + doAllCoo2Crs(0, 0); + + // Square cases + for (size_t i = 1; i < 256; i *= 4) { + size_t dim = (std::rand() % 511) + 1; + doAllCoo2Crs(dim, dim); + } + + // Non-square cases + for (size_t i = 1; i < 256; i *= 4) { + size_t m = (std::rand() % 511) + 1; + size_t n = (std::rand() % 511) + 1; + while (n == m) n = (std::rand() % 511) + 1; + doAllCoo2Crs(m, n); + } + + RandCooMat cooMat(2, 2, 2 * 2, 10, + 10); + auto crsMatrix = KokkosSparse::coo2crs(2, 2, cooMat.get_row(), + cooMat.get_col(), cooMat.get_data()); + auto cooMatrix = 
KokkosSparse::crs2coo(crsMatrix); + + check_crs_matrix(crsMatrix, cooMatrix.row(), cooMatrix.col(), + cooMatrix.data()); +} + +TEST_F(TestCategory, sparse_coo2crs_staticMatrix_edgeCases) { + int m = 4; + int n = 4; + long long staticRow[16]{0, 1, 3, 2, 3, 2, 2, 2, 0, 0, 0, 1, 2, 0, 3, 0}; + long long staticCol[16]{1, 1, 2, 3, 3, 2, 3, 2, 0, 0, 1, 3, 1, 2, 0, 0}; + float staticData[16]{7.28411, 8.17991, 8.84304, 5.01788, 9.85646, 5.79404, + 8.42014, 1.90238, 8.24195, 4.39955, 3.2637, 5.4546, + 6.51895, 8.09302, 9.36294, 3.44206}; + Kokkos::View row("coo row", 16); + Kokkos::View col("coo col", 16); + Kokkos::View data("coo data", 16); + + typename Kokkos::View::HostMirror row_h = + Kokkos::create_mirror_view(row); + typename Kokkos::View::HostMirror col_h = + Kokkos::create_mirror_view(col); + typename Kokkos::View::HostMirror data_h = + Kokkos::create_mirror_view(data); + for (int i = 0; i < 16; i++) { + row_h(i) = staticRow[i]; + col_h(i) = staticCol[i]; + data_h(i) = staticData[i]; + } + + Kokkos::deep_copy(row, row_h); + Kokkos::deep_copy(col, col_h); + Kokkos::deep_copy(data, data_h); + + // Even partitions with multiple threads + auto crsMatTs4 = KokkosSparse::coo2crs(m, n, row, col, data); + check_crs_matrix(crsMatTs4, row_h, col_h, data_h); + + // Even partitions, single thread, fully sparse row + long long staticRowTs1[16]{0, 3, 0, 2, 2, 3, 0, 3, 2, 0, 0, 0, 0, 3, 3, 0}; + long long staticColTs1[16]{3, 1, 3, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 0, 0, 0}; + float staticDataTs1[16]{6.1355, 6.53989, 8.58559, 6.37476, 4.18964, 2.41146, + 1.82177, 1.4249, 1.52659, 5.50521, 8.0484, 3.98874, + 6.74709, 3.35072, 7.81944, 5.83494}; + for (int i = 0; i < 16; i++) { + row_h(i) = staticRowTs1[i]; + col_h(i) = staticColTs1[i]; + data_h(i) = staticDataTs1[i]; + } + Kokkos::deep_copy(row, row_h); + Kokkos::deep_copy(col, col_h); + Kokkos::deep_copy(data, data_h); + + auto crsMatTs1 = KokkosSparse::coo2crs(m, n, row, col, data); + check_crs_matrix(crsMatTs1, row_h, col_h, 
data_h); + + // Fully sparse + for (int i = 0; i < 16; i++) { + row_h(i) = -staticRowTs1[i]; + col_h(i) = -staticColTs1[i]; + } + Kokkos::deep_copy(row, row_h); + Kokkos::deep_copy(col, col_h); + + auto crsMatFsTs1 = KokkosSparse::coo2crs(m, n, row, col, data); + check_crs_matrix(crsMatFsTs1, row_h, col_h, data); +} +} // namespace Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_crs2ccs.hpp b/sparse/unit_test/Test_Sparse_crs2ccs.hpp index c52ea6dfbe..720c6cd05e 100644 --- a/sparse/unit_test/Test_Sparse_crs2ccs.hpp +++ b/sparse/unit_test/Test_Sparse_crs2ccs.hpp @@ -24,6 +24,9 @@ template +void check_coo_matrix(CrsType crsMatRef, RowType row, ColType col, + DataType data) { + // Copy coo to host + typename RowType::HostMirror row_h = Kokkos::create_mirror_view(row); + Kokkos::deep_copy(row_h, row); + typename ColType::HostMirror col_h = Kokkos::create_mirror_view(col); + Kokkos::deep_copy(col_h, col); + typename DataType::HostMirror data_h = Kokkos::create_mirror_view(data); + Kokkos::deep_copy(data_h, data); + + // printf("coo in:\n"); + // for (unsigned i = 0; i < data_h.extent(0); i++) + // printf("(%lld, %lld, %g)\n", row_h(i), col_h(i), data_h(i)); + + auto crs_col_ids_ref_d = crsMatRef.graph.entries; + auto crs_row_map_ref_d = crsMatRef.graph.row_map; + auto crs_vals_ref_d = crsMatRef.values; + + using ViewTypeCrsColIdsRef = decltype(crs_col_ids_ref_d); + using ViewTypeCrsRowMapRef = decltype(crs_row_map_ref_d); + using ViewTypeCrsValsRef = decltype(crs_vals_ref_d); + + // Copy crs to host + typename ViewTypeCrsColIdsRef::HostMirror crs_col_ids_ref = + Kokkos::create_mirror_view(crs_col_ids_ref_d); + Kokkos::deep_copy(crs_col_ids_ref, crs_col_ids_ref_d); + typename ViewTypeCrsRowMapRef::HostMirror crs_row_map_ref = + Kokkos::create_mirror_view(crs_row_map_ref_d); + Kokkos::deep_copy(crs_row_map_ref, crs_row_map_ref_d); + typename ViewTypeCrsValsRef::HostMirror crs_vals_ref = + Kokkos::create_mirror_view(crs_vals_ref_d); + 
Kokkos::deep_copy(crs_vals_ref, crs_vals_ref_d); + + Kokkos::fence(); + + ASSERT_EQ(crsMatRef.nnz(), row.extent(0)); + ASSERT_EQ(crsMatRef.nnz(), col.extent(0)); + ASSERT_EQ(crsMatRef.nnz(), data.extent(0)); + + for (decltype(row.extent(0)) idx = 0; idx < row.extent(0); ++idx) { + auto row_id = row_h(idx); + auto col_id = col_h(idx); + auto val = data_h(idx); + std::string fail_msg = "idx - " + std::to_string(idx) + + " row: " + std::to_string(row_id) + + ", col: " + std::to_string(col_id); + + auto row_start_ref = crs_row_map_ref(row_id); + auto row_stop_ref = crs_row_map_ref(row_id + 1); + + auto crs_idx = row_start_ref; + for (; crs_idx < row_stop_ref; crs_idx++) { + if (crs_col_ids_ref(crs_idx) == col_id) { + // crs2coo does a direct copy, no need for an epsilon. + if (crs_vals_ref(crs_idx) == val) break; + } + } + if (crs_idx == row_stop_ref) + FAIL() << fail_msg << " not found in crsMatRef!"; + } +} + +template +void doCrs2Coo(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + using RandCrsMatType = RandCsMatrix; + RandCrsMatType crsMat(m, n, min_val, max_val, m == 0 || n == 0); + + using CrsOT = typename RandCrsMatType::IdViewTypeD::value_type; + using CrsType = + typename KokkosSparse::CrsMatrix; + auto map = crsMat.get_map(); + auto ids = crsMat.get_ids(); + CrsType crsMatrix("doCrs2Coo", crsMat.get_dim1(), crsMat.get_dim2(), + crsMat.get_nnz(), crsMat.get_vals(), map, ids); + + auto cooMat = KokkosSparse::crs2coo(crsMatrix); + check_coo_matrix(crsMatrix, cooMat.row(), cooMat.col(), cooMat.data()); +} + +template +void doAllScalarsCrs2Coo(size_t m, size_t n, int min, int max) { + doCrs2Coo(m, n, min, max); + doCrs2Coo(m, n, min, max); + doCrs2Coo, LayoutType, ExeSpaceType>(m, n, min, max); + doCrs2Coo, LayoutType, ExeSpaceType>(m, n, min, max); +} + +template +void doAllLayoutsCrs2Coo(size_t m, size_t n, int min, int max) { + doAllScalarsCrs2Coo(m, n, min, max); + doAllScalarsCrs2Coo(m, n, min, max); +} + +template +void doAllCrs2Coo(size_t m, 
size_t n) { + int min = 1, max = 10; + doAllLayoutsCrs2Coo(m, n, min, max); +} + +TEST_F(TestCategory, sparse_crs2coo) { + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count() % + UINT32_MAX; + std::srand(ticks); + + // Square cases + for (size_t i = 1; i < 256; i *= 4) { + size_t dim = (std::rand() % 511) + 1; + doAllCrs2Coo(dim, dim); + } + + // Non-square cases + for (size_t i = 1; i < 256; i *= 4) { + size_t m = (std::rand() % 511) + 1; + size_t n = (std::rand() % 511) + 1; + while (n == m) n = (std::rand() % 511) + 1; + doAllCrs2Coo(m, n); + } +} +} // namespace Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 7960a1a9bc..358205b713 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -153,7 +153,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; srand(245); lno_t numCols = numRows; crsMat_t input_mat = @@ -177,8 +177,8 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, int apply_count = 3; // test symmetric, forward, backward scalar_view_t x_vector( Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); - const scalar_t one = Kokkos::Details::ArithTraits::one(); - const scalar_t zero = Kokkos::Details::ArithTraits::zero(); + const scalar_t one = Kokkos::ArithTraits::one(); + const scalar_t zero = Kokkos::ArithTraits::zero(); //*** Point-coloring version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; @@ -242,7 +242,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, typedef Kokkos::View 
scalar_view2d_t; typedef Kokkos::View host_scalar_view2d_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; crsMat_t input_mat = @@ -270,11 +270,11 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, for (lno_t j = 0; j < nv; j++) { sum += solution_x(j, i) * solution_x(j, i); } - initial_norms[i] = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(sum)); + initial_norms[i] = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(sum)); } int apply_count = 3; // test symmetric, forward, backward - const scalar_t zero = Kokkos::Details::ArithTraits::zero(); + const scalar_t zero = Kokkos::ArithTraits::zero(); //*** Point-coloring version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; @@ -289,8 +289,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, scalar_t diff = x_host(j, i) - solution_x(j, i); diffDot += diff * diff; } - mag_t res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(diffDot)); + mag_t res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(diffDot)); EXPECT_LT(res, initial_norms[i]); } } @@ -312,8 +312,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, scalar_t diff = x_host(j, i) - solution_x(j, i); diffDot += diff * diff; } - mag_t res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(diffDot)); + mag_t res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(diffDot)); EXPECT_LT(res, initial_norms[i]); } } @@ -332,8 +332,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, scalar_t diff = x_host(j, i) - solution_x(j, i); diffDot += diff * diff; } - mag_t res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(diffDot)); + mag_t res = Kokkos::ArithTraits::sqrt( + 
Kokkos::ArithTraits::abs(diffDot)); EXPECT_LT(res, initial_norms[i]); } } @@ -350,8 +350,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, scalar_t diff = x_host(j, i) - solution_x(j, i); diffDot += diff * diff; } - mag_t res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(diffDot)); + mag_t res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(diffDot)); EXPECT_LT(res, initial_norms[i]); } } @@ -361,8 +361,8 @@ template void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { - const scalar_t zero = Kokkos::Details::ArithTraits::zero(); - const scalar_t one = Kokkos::Details::ArithTraits::one(); + const scalar_t zero = Kokkos::ArithTraits::zero(); + const scalar_t one = Kokkos::ArithTraits::one(); srand(245); typedef typename device::execution_space exec_space; typedef @@ -419,10 +419,9 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, // Copy solution back Kokkos::deep_copy(x, x_host); // Check against gold solution - scalar_t xSq = KokkosBlas::dot(x, x); - scalar_t solnDot = KokkosBlas::dot(x, xgold); - double scaledSolutionDot = - Kokkos::Details::ArithTraits::abs(solnDot / xSq); + scalar_t xSq = KokkosBlas::dot(x, x); + scalar_t solnDot = KokkosBlas::dot(x, xgold); + double scaledSolutionDot = Kokkos::ArithTraits::abs(solnDot / xSq); EXPECT_TRUE(0.99 < scaledSolutionDot); } @@ -533,7 +532,7 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef typename crsMat_t::index_type::non_const_type entries_view_t; typedef typename crsMat_t::row_map_type::non_const_type rowmap_view_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; const scalar_t one = Kokkos::ArithTraits::one(); srand(245); std::vector rowmap = {0}; @@ -630,7 +629,7 @@ void 
test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; const scalar_t one = Kokkos::ArithTraits::one(); size_type nnz = nnzPerRow * numRows; crsMat_t input_mat = diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index 41204c9b4d..f6e4d0bc84 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -192,9 +192,6 @@ void test_mdf() { test_mdf(); \ } -#define NO_TEST_COMPLEX - #include #undef KOKKOSKERNELS_EXECUTE_TEST -#undef NO_TEST_COMPLEX diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 9b99c1000d..4370ebe37e 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -304,7 +304,7 @@ void run_test_par_ilut_precond() { constexpr auto diagDominance = 1; constexpr bool verbose = false; - typename sp_matrix_type::non_const_size_type nnz = 10 * numRows; + size_type nnz = 10 * numRows; auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< sp_matrix_type>(numRows, numCols, nnz, 0, lno_t(0.01 * numRows), diagDominance); diff --git a/sparse/unit_test/Test_Sparse_replaceSumInto.hpp b/sparse/unit_test/Test_Sparse_replaceSumInto.hpp index af61aeb320..f8427dc925 100644 --- a/sparse/unit_test/Test_Sparse_replaceSumInto.hpp +++ b/sparse/unit_test/Test_Sparse_replaceSumInto.hpp @@ -50,7 +50,7 @@ class ModifyEvenNumberedRows { ordinal_type cols[1]; value_type vals[1]; - const value_type ONE = Kokkos::Details::ArithTraits::one(); + const value_type ONE = Kokkos::ArithTraits::one(); const value_type THREE = ONE + ONE + ONE; cols[0] = lclRow; @@ -97,7 +97,7 @@ bool checkWhetherEvenNumberedRowsWereModified(const CrsMatrixType& A, typedef 
typename CrsMatrixType::value_type SC; typedef typename CrsMatrixType::ordinal_type LO; - const SC ONE = Kokkos::Details::ArithTraits::one(); + const SC ONE = Kokkos::ArithTraits::one(); const SC TWO = ONE + ONE; const SC THREE = ONE + ONE + ONE; @@ -135,7 +135,7 @@ void testOneCase(bool& /*success*/, // Teuchos::FancyOStream& out, std::ostream& out, const CrsMatrixType& A, const bool replace, const bool sorted, const bool atomic) { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; typedef typename CrsMatrixType::value_type value_type; // Teuchos::OSTab tab0 (out); diff --git a/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp b/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp index 76bbfe37a9..98affff57d 100644 --- a/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp +++ b/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp @@ -47,7 +47,7 @@ class ModifyEntries { KOKKOS_FUNCTION void operator()(const ordinal_type& lclRow, ordinal_type& numModified) const { - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; typedef typename KAT::mag_type mag_type; const scalar_type ONE = KAT::one(); @@ -171,7 +171,7 @@ void checkWhetherEntriesWereModified( // using Teuchos::RCP; typedef typename CrsMatrixType::value_type value_type; typedef typename CrsMatrixType::ordinal_type ordinal_type; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; // If debug is false, we capture all output in an // std::ostringstream, and don't print it unless the test fails @@ -281,7 +281,7 @@ void testOneCaseImpl(bool& /*success*/, std::ostream& out, // Restore original values. 
auto val_h = Kokkos::create_mirror_view(A.values); - const scalar_type ONE = Kokkos::Details::ArithTraits::one(); + const scalar_type ONE = Kokkos::ArithTraits::one(); scalar_type curVal = ONE; for (ordinal_type k = 0; k < A.numCols(); ++k, curVal += ONE) { val_h[k] = curVal; @@ -388,7 +388,7 @@ void testAllSizes(bool& success, typedef typename matrix_type::value_type value_type; typedef typename matrix_type::ordinal_type ordinal_type; typedef typename matrix_type::size_type size_type; - const value_type ONE = Kokkos::Details::ArithTraits::one(); + const value_type ONE = Kokkos::ArithTraits::one(); // Teuchos::OSTab tab0 (out); out << "maxNumEnt: " << maxNumEnt << endl; diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index 4d53b1e126..7e655d4c0c 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -78,12 +78,13 @@ void run_spgemm_noreuse(crsMat_t A, crsMat_t B, crsMat_t &C) { } template -int run_spgemm(crsMat_t A, crsMat_t B, +int run_spgemm(crsMat_t &A, crsMat_t &B, KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &C, bool testReuse) { typedef typename crsMat_t::size_type size_type; typedef typename crsMat_t::ordinal_type lno_t; typedef typename crsMat_t::value_type scalar_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename device::execution_space, @@ -113,7 +114,14 @@ int run_spgemm(crsMat_t A, crsMat_t B, EXPECT_TRUE(sh->is_numeric_called()); if (testReuse) { - // Give A and B completely new random values, and re-run just numeric + // Give A and B completely new random values (changing both the pointer + // and contents), and re-run just numeric. 
+ A.values = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new A values"), + A.nnz()); + B.values = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new B values"), + B.nnz()); randomize_matrix_values(A.values); randomize_matrix_values(B.values); KokkosSparse::spgemm_numeric(kh, A, false, B, false, C); @@ -127,7 +135,7 @@ int run_spgemm(crsMat_t A, crsMat_t B, } template -int run_spgemm_old_interface(crsMat_t A, crsMat_t B, +int run_spgemm_old_interface(crsMat_t &A, crsMat_t &B, KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &result, bool testReuse) { typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -188,7 +196,14 @@ int run_spgemm_old_interface(crsMat_t A, crsMat_t B, EXPECT_TRUE(sh->is_numeric_called()); if (testReuse) { - // Give A and B completely new random values, and re-run just numeric + // Give A and B completely new random values (changing both the pointer + // and contents), and re-run just numeric. + A.values = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new A values"), + A.nnz()); + B.values = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new B values"), + B.nnz()); randomize_matrix_values(A.values); randomize_matrix_values(B.values); KokkosSparse::Experimental::spgemm_numeric( @@ -468,6 +483,58 @@ void test_issue402() { << "SpGEMM still has issue 402 bug; C=AA' is incorrect!\n"; } +template +void test_issue1738() { +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUDA_VERSION >= 11000) && \ + (CUDA_VERSION < 11040) + { + std::cerr + << "TEST SKIPPED: See " + "https://github.com/kokkos/kokkos-kernels/issues/1777 for details." + << std::endl; + return; + } +#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL + // Make sure that std::invalid_argument is thrown if you: + // - call numeric where an input matrix's entries have changed. + // - try to reuse an spgemm handle by calling symbolic with new input + // matrices + // This check is only enabled in debug builds. 
+#ifndef NDEBUG + using crsMat_t = CrsMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space>; + crsMat_t A1 = KokkosSparse::Impl::kk_generate_diag_matrix(100); + crsMat_t B1 = KokkosSparse::Impl::kk_generate_diag_matrix(100); + crsMat_t A2 = KokkosSparse::Impl::kk_generate_diag_matrix(50); + crsMat_t B2 = KokkosSparse::Impl::kk_generate_diag_matrix(50); + { + KernelHandle kh; + kh.create_spgemm_handle(); + crsMat_t C1; + KokkosSparse::spgemm_symbolic(kh, A1, false, B1, false, C1); + KokkosSparse::spgemm_numeric(kh, A1, false, B1, false, C1); + crsMat_t C2; + EXPECT_THROW(KokkosSparse::spgemm_symbolic(kh, A2, false, B2, false, C2), + std::invalid_argument); + } + { + KernelHandle kh; + kh.create_spgemm_handle(); + crsMat_t C1; + KokkosSparse::spgemm_symbolic(kh, A1, false, B1, false, C1); + // Note: A1 is a 100x100 diagonal matrix, so the first entry in the first + // row is 0. Change it to a 1 and make sure spgemm_numeric notices that it + // changed. 
+ Kokkos::deep_copy(Kokkos::subview(A1.graph.entries, 0), 1); + EXPECT_THROW(KokkosSparse::spgemm_numeric(kh, A1, false, B1, false, C1), + std::invalid_argument); + } +#endif +} + #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -513,6 +580,7 @@ void test_issue402() { test_spgemm_symbolic(true, false); \ test_spgemm_symbolic(false, false); \ test_issue402(); \ + test_issue1738(); \ } // test_spgemm(50000, 50000 * 30, 100, 10); diff --git a/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp b/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp index cfdf13a709..25a5d155a7 100644 --- a/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp @@ -165,7 +165,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) { return false; } - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename scalar_view_t::non_const_value_type>::mag_type eps_type; eps_type eps = std::is_same::value ? 
2 * 1e-3 : 1e-7; diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 12065781f1..77cdb1ede1 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -47,10 +47,10 @@ namespace Test { template void run_test_spiluk() { - typedef Kokkos::View RowMapType; - typedef Kokkos::View EntriesType; - typedef Kokkos::View ValuesType; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::View RowMapType; + typedef Kokkos::View EntriesType; + typedef Kokkos::View ValuesType; + typedef Kokkos::ArithTraits AT; const size_type nrows = 9; const size_type nnz = 21; @@ -262,6 +262,216 @@ void run_test_spiluk() { } } +template +void run_test_spiluk_streams(int test_algo, int nstreams) { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; + using crsMat_t = CrsMatrix; + using AT = Kokkos::ArithTraits; + + // Workaround for OpenMP: skip tests if concurrency < nstreams because of + // not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; + } + } +#endif + if (!run_streams_test) return; + + const size_type nrows = 9; + const size_type nnz = 21; + + std::vector instances; + if (nstreams == 1) + instances = 
Kokkos::Experimental::partition_space(execution_space(), 1); + else if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector A_row_map_v(nstreams); + std::vector A_entries_v(nstreams); + std::vector A_values_v(nstreams); + std::vector L_row_map_v(nstreams); + std::vector L_entries_v(nstreams); + std::vector L_values_v(nstreams); + std::vector U_row_map_v(nstreams); + std::vector U_entries_v(nstreams); + std::vector U_values_v(nstreams); + + RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); + EntriesType_hostmirror hentries("hentries", nnz); + ValuesType_hostmirror hvalues("hvalues", nnz); + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + scalar_t MONE = scalar_t(-1); + + hrow_map(0) = 0; + hrow_map(1) = 3; + hrow_map(2) = 5; + hrow_map(3) = 6; + hrow_map(4) = 9; + hrow_map(5) = 11; + hrow_map(6) = 13; + hrow_map(7) = 15; + hrow_map(8) = 18; + hrow_map(9) = nnz; + + hentries(0) = 0; + hentries(1) = 2; + hentries(2) = 5; + hentries(3) = 1; + hentries(4) = 6; + hentries(5) = 2; + hentries(6) = 0; + hentries(7) = 3; + hentries(8) = 4; + hentries(9) = 0; + hentries(10) = 4; + hentries(11) = 1; + hentries(12) = 5; + hentries(13) = 2; + hentries(14) = 6; + hentries(15) = 3; + hentries(16) = 4; + hentries(17) = 7; + hentries(18) = 3; + hentries(19) = 4; + hentries(20) = 8; + + hvalues(0) = 10; + hvalues(1) = 0.3; + hvalues(2) = 0.6; + hvalues(3) = 11; + hvalues(4) = 0.7; + hvalues(5) = 12; + hvalues(6) = 5; + hvalues(7) = 13; + hvalues(8) = 1; + hvalues(9) = 4; + hvalues(10) = 14; + hvalues(11) = 3; + hvalues(12) = 15; + hvalues(13) = 7; + hvalues(14) = 16; + hvalues(15) = 6; + hvalues(16) = 5; + hvalues(17) = 17; + hvalues(18) = 2; + 
hvalues(19) = 2.5; + hvalues(20) = 18; + + typename KernelHandle::const_nnz_lno_t fill_lev = 2; + + for (int i = 0; i < nstreams; i++) { + // Allocate A as input + A_row_map_v[i] = RowMapType("A_row_map", nrows + 1); + A_entries_v[i] = EntriesType("A_entries", nnz); + A_values_v[i] = ValuesType("A_values", nnz); + + // Copy from host to device + Kokkos::deep_copy(A_row_map_v[i], hrow_map); + Kokkos::deep_copy(A_entries_v[i], hentries); + Kokkos::deep_copy(A_values_v[i], hvalues); + + // Create handle + kh_v[i] = KernelHandle(); + if (test_algo == 0) + kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, + 4 * nrows, 4 * nrows); + else if (test_algo == 1) + kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, + 4 * nrows, 4 * nrows); + kh_ptr_v[i] = &kh_v[i]; + + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + std::cout << " Stream " << i << ": "; + spiluk_handle->print_algorithm(); + + // Allocate L and U as outputs + L_row_map_v[i] = RowMapType("L_row_map", nrows + 1); + L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); + L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL()); + U_row_map_v[i] = RowMapType("U_row_map", nrows + 1); + U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); + U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); + + // Symbolic phase + spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], + L_row_map_v[i], L_entries_v[i], U_row_map_v[i], + U_entries_v[i], nstreams); + + Kokkos::fence(); + + Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(L_values_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); + Kokkos::resize(U_values_v[i], spiluk_handle->get_nnzU()); + } // Done handle creation and spiluk_symbolic on all streams + + // Numeric phase + spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, + A_entries_v, A_values_v, L_row_map_v, 
L_entries_v, + L_values_v, U_row_map_v, U_entries_v, U_values_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + crsMat_t A("A_Mtx", nrows, nrows, nnz, A_values_v[i], A_row_map_v[i], + A_entries_v[i]); + crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values_v[i], + L_row_map_v[i], L_entries_v[i]); + crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values_v[i], + U_row_map_v[i], U_entries_v[i]); + + // Create a reference view e set to all 1's + ValuesType e_one("e_one", nrows); + Kokkos::deep_copy(e_one, 1.0); + + // Create two views for spmv results + ValuesType bb("bb", nrows); + ValuesType bb_tmp("bb_tmp", nrows); + + // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) + KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + + typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + + KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); + KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + + typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + + EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + + kh_v[i].destroy_spiluk_handle(); + } +} + } // namespace Test template (); } +template +void test_spiluk_streams() { + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 1 stream" << std::endl; + Test::run_test_spiluk_streams(0, 1); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; + Test::run_test_spiluk_streams(0, 2); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; + Test::run_test_spiluk_streams(0, 3); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; + Test::run_test_spiluk_streams(0, 4); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 1 stream" << std::endl; + Test::run_test_spiluk_streams(1, 1); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; + Test::run_test_spiluk_streams(1, 2); + + std::cout << 
"SPILUKAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; + Test::run_test_spiluk_streams(1, 3); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; + Test::run_test_spiluk_streams(1, 4); +} + #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spiluk##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_spiluk(); \ + test_spiluk_streams(); \ } #define NO_TEST_COMPLEX diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index d0a6d1464c..9da0733581 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -159,8 +159,8 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv( - crsMat_t input_mat, x_vector_type x, y_vector_type y, - typename y_vector_type::non_const_value_type alpha, + const Controls &controls, crsMat_t input_mat, x_vector_type x, + y_vector_type y, typename y_vector_type::non_const_value_type alpha, typename y_vector_type::non_const_value_type beta, char mode, typename Kokkos::ArithTraits::mag_type max_val) { @@ -183,7 +183,7 @@ void check_spmv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + KokkosSparse::spmv(controls, &mode, alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -422,9 +422,10 @@ Kokkos::complex randomUpperBound>(int mag) { return Kokkos::complex(mag, mag); } -template -void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, bool heavy) { +template +void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, + lno_t bandwidth, lno_t row_size_variance, bool heavy) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; @@ -479,8 +480,8 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, for (double beta : 
testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode, - max_error); + Test::check_spmv(controls, input_mat, input_x, output_y, alpha, beta, + mode, max_error); } } } @@ -490,13 +491,31 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, // hoping the transpose won't have a long column... mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode, - max_error); + Test::check_spmv(controls, input_mat, input_xt, output_yt, alpha, beta, + mode, max_error); } } } } +template +void test_spmv_algorithms(lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t row_size_variance, bool heavy) { + { + Controls controls; + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } + + { + Controls controls; + controls.setParameter("algorithm", "native"); + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } +} + template void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, @@ -899,13 +918,13 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { // check that the controls are flowing down correctly in the spmv kernel template void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { + lno_t row_size_variance, + const Controls &controls = Controls()) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; using x_vector_type = scalar_view_t; using y_vector_type = scalar_view_t; - using Controls = KokkosKernels::Experimental::Controls; using mag_t = typename Kokkos::ArithTraits::mag_type; constexpr mag_t max_x = static_cast(10); @@ -931,8 +950,6 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, const mag_t max_error = max_y + bandwidth * max_val * max_x; - Controls controls; - 
Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0, max_error); Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0, @@ -941,6 +958,15 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, max_error); } // test_spmv_controls +// test the native algorithm +template +void test_spmv_native(lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t row_size_variance) { + Controls controls; + controls.setParameter("algorithm", "native"); + test_spmv_controls(numRows, nnz, bandwidth, row_size_variance, controls); +} // test_spmv_native + // call it if ordinal int and, scalar float and double are instantiated. template void test_github_issue_101() { @@ -1577,15 +1603,18 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, #define EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_spmv(1000, 1000 * 3, 200, 10, true); \ - test_spmv(1000, 1000 * 3, 100, 10, true); \ - test_spmv(1000, 1000 * 20, 100, 5, true); \ - test_spmv(50000, 50000 * 3, 20, 10, \ - false); \ - test_spmv(50000, 50000 * 3, 100, 10, \ - false); \ - test_spmv(10000, 10000 * 2, 100, 5, \ - false); \ + test_spmv_algorithms(1000, 1000 * 3, 200, \ + 10, true); \ + test_spmv_algorithms(1000, 1000 * 3, 100, \ + 10, true); \ + test_spmv_algorithms(1000, 1000 * 20, \ + 100, 5, true); \ + test_spmv_algorithms(50000, 50000 * 3, \ + 20, 10, false); \ + test_spmv_algorithms(50000, 50000 * 3, \ + 100, 10, false); \ + test_spmv_algorithms(10000, 10000 * 2, \ + 100, 5, false); \ test_spmv_controls(10000, 10000 * 20, \ 100, 5); \ } diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index ccbcb21301..b2883c1e91 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -14,563 +14,629 @@ // //@HEADER +/*! 
\file Test_Sparse_spmv_bsr.hpp + + Test the following 768 combos for at least a few matcies. + + Algorithms Alpha Beta Block Sizes Modes + (none) 0 0 1 N + native x 1 x 1 x 2 x T + experimental_bsr_tc -1 -1 5 C + 3.7 -1.5 9 H + + There are also a subset of tests on larger matrices +*/ + #include +#include +#include + #include #include -#include -#include "KokkosSparse_spmv.hpp" -#include "KokkosSparse_BsrMatrix.hpp" -#include "KokkosSparse_CrsMatrix.hpp" #include #include #include #include - #include "KokkosKernels_Controls.hpp" #include "KokkosKernels_default_types.hpp" -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; +#include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_crs_to_bsr_impl.hpp" +#include "KokkosSparse_bsr_to_crs_impl.hpp" +#include "KokkosSparse_Utils.hpp" + +using kokkos_complex_double = Kokkos::complex; +using kokkos_complex_float = Kokkos::complex; -namespace Test_Bsr { +namespace Test_Spmv_Bsr { -/// Random generator -template -inline Scalar random() { - auto const max = static_cast(RAND_MAX) + static_cast(1); - return static_cast(std::rand()) / max; +/*! \brief Maximum value used to fill A */ +template +constexpr T max_a() { + T discard, maxVal; + KokkosKernels::Impl::getRandomBounds(10.0, discard, maxVal); + return maxVal; } -template -inline void set_random_value(Scalar &v) { - v = random(); +/*! \brief Maximum value used to fill X */ +template +constexpr T max_x() { + T discard, maxVal; + KokkosKernels::Impl::getRandomBounds(10.0, discard, maxVal); + return maxVal; } -template -inline void set_random_value(Kokkos::complex &v) { - Scalar vre = random(); - Scalar vim = random(); - v = Kokkos::complex(vre, vim); +/*! 
\brief Maximum value used to fill Y */ +template +constexpr T max_y() { + T discard, maxVal; + KokkosKernels::Impl::getRandomBounds(10.0, discard, maxVal); + return maxVal; } -template -inline void set_random_value(std::complex &v) { - Scalar vre = random(); - Scalar vim = random(); - v = std::complex(vre, vim); +/*! \brief whether the mode transposes the matrix*/ +inline bool mode_is_transpose(const char *mode) { + return mode[0] == 'T' || mode[0] == 'H'; } -/// \brief Routine to make CRS-style entries of the block matrix -/// -/// \tparam scalar_t Template type for the numerical values -/// \param mat_b1 Sparse matrix whose graph will be used -/// \param blockSize Block size for each entries -/// \param mat_rowmap[out] CRS-style row map for the block matrix -/// \param mat_colidx[out] CRS-style column entries for the block matrix -/// \param mat_val[out] Numerical (random) values -template -void make_block_entries( - const KokkosSparse::CrsMatrix &mat_b1, - int blockSize, rowmap_type &mat_rowmap, colidx_type &mat_colidx, - values_type &mat_val) { - size_t nnz = static_cast(blockSize) * static_cast(blockSize) * - mat_b1.nnz(); - - for (size_t ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); - - // - // Create graph for CrsMatrix - // - - for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { - const size_type jbeg = mat_b1.graph.row_map(ir); - const size_type jend = mat_b1.graph.row_map(ir + 1); - for (lno_t ib = 0; ib < blockSize; ++ib) { - const lno_t my_row = ir * blockSize + ib; - mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; - for (size_type ijk = jbeg; ijk < jend; ++ijk) { - const auto col0 = mat_b1.graph.entries(ijk); - for (lno_t jb = 0; jb < blockSize; ++jb) { - mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = - col0 * blockSize + jb; - } - } - } - } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) +/*! 
\brief 0x0 matrix */ +template +Bsr bsr_corner_case_0_by_0(const int blockSize) { + return Bsr("empty", 0, 0, 0, nullptr, nullptr, nullptr, blockSize); } -/// \brief Driver routine for checking BsrMatrix times vector -template -void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, - const lno_t bMax, int &num_errors) { - // The mat_structure view is used to generate a matrix using - // finite difference (FD) or finite element (FE) discretization - // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); - mat_structure(0, 0) = 8; // Request 8 grid point in 'x' direction - mat_structure(0, 1) = 0; // Add BC to the left - mat_structure(0, 2) = 0; // Add BC to the right - mat_structure(1, 0) = 7; // Request 7 grid point in 'y' direction - mat_structure(1, 1) = 0; // Add BC to the bottom - mat_structure(1, 2) = 0; // Add BC to the top - mat_structure(2, 0) = 9; // Request 9 grid point in 'z' direction - mat_structure(2, 1) = 0; // Add BC to the bottom - mat_structure(2, 2) = 0; // Add BC to the top - - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename KokkosSparse::CrsMatrix - h_crsMat_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; - - h_crsMat_t mat_b1 = - Test::generate_structured_matrix3D("FD", mat_structure); - - num_errors = 0; - for (lno_t blockSize = 1; blockSize <= bMax; ++blockSize) { - // - // Fill blocks with random values - // - - lno_t nRow = blockSize * mat_b1.numRows(); - lno_t nCol = blockSize * mat_b1.numCols(); - size_type nnz = static_cast(blockSize) * - static_cast(blockSize) * mat_b1.nnz(); - - Kokkos::View d_rowmap("crsmatrix", nRow + 1); - auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); - - Kokkos::View d_colidx("crsmatrix", nnz); - auto h_colidx = Kokkos::create_mirror_view(d_colidx); - - Kokkos::View d_matval("crsmatrix", nnz); - auto h_matval = 
Kokkos::create_mirror_view(d_matval); - - // Create the entries - make_block_entries(mat_b1, blockSize, h_rowmap, - h_colidx, h_matval); - - Kokkos::deep_copy(d_matval, h_matval); - Kokkos::deep_copy(d_colidx, h_colidx); - Kokkos::deep_copy(d_rowmap, h_rowmap); - - // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, - d_colidx); - - x_vector_type xref("new_right_hand_side", nRow); - auto h_xref = Kokkos::create_mirror_view(xref); - for (lno_t ir = 0; ir < nRow; ++ir) { - set_random_value(h_xref(ir)); - } - Kokkos::deep_copy(xref, h_xref); +/*! \brief 0x1 matrix */ +template +Bsr bsr_corner_case_0_by_1(const int blockSize) { + return Bsr("empty", 0, blockSize, 0, nullptr, nullptr, nullptr, blockSize); +} - y_vector_type y0("y_init", nRow); - auto h_y0 = Kokkos::create_mirror_view(y0); - for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir)); - Kokkos::deep_copy(y0, h_y0); +/*! \brief 1x0 matrix */ +template +Bsr bsr_corner_case_1_by_0(const int blockSize) { + return Bsr("empty", blockSize, 0, 0, nullptr, nullptr, nullptr, blockSize); +} + +template +Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using size_type = typename Bsr::non_const_size_type; + using Crs = + KokkosSparse::CrsMatrix; + using Graph = typename Crs::staticcrsgraph_type; + + // construct a random Crs Matrix + Test::RandCsMatrix + rcs(blockRows, blockCols, scalar_type(0), max_a()); + + const auto colids = Kokkos::subview( + rcs.get_ids(), Kokkos::make_pair(size_t(0), rcs.get_nnz())); + const auto vals = Kokkos::subview( + rcs.get_vals(), Kokkos::make_pair(size_t(0), rcs.get_nnz())); + Graph graph(colids, rcs.get_map()); + Crs crs("crs", blockCols, vals, graph); + + // expand to Bsr matrix + return KokkosSparse::Impl::expand_crs_to_bsr(crs, blockSize); +} - y_vector_type 
ycrs("crs_product_result", nRow); - auto h_ycrs = Kokkos::create_mirror_view(ycrs); - for (lno_t ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); - Kokkos::deep_copy(ycrs, h_ycrs); +/*! \brief reference SpMV is the KokkosSparse::spmv on the equivalent point + * matrix + */ +template +void reference_spmv(const char *mode, const Alpha &alpha, const Bsr &a, + const XVector &x, const Beta &beta, const YVector &y) { + using Crs = KokkosSparse::CrsMatrix< + typename Bsr::non_const_value_type, typename Bsr::non_const_ordinal_type, + typename Bsr::device_type, void, typename Bsr::non_const_size_type>; + const Crs crs = KokkosSparse::Impl::bsr_to_crs(a); + + KokkosSparse::spmv(mode, alpha, crs, x, beta, y); +} - // - // Make reference computation with a CrsMatrix variable - // +/*! \brief test a specific spmv + +*/ +template +void test_spmv(const char *alg, const char *mode, const Alpha &alpha, + const Beta &beta, const Bsr &a, const XVector &x, + const YVector &y) { + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + // generate expected result from reference implementation + YVector yExp("yExp", y.extent(0)); + Kokkos::deep_copy(yExp, y); + reference_spmv(mode, alpha, a, x, beta, yExp); + + // scratch space for actual value (don't modify input) + YVector yAct("yAct", y.extent(0)); + Kokkos::deep_copy(yAct, y); + + if (alg) { KokkosKernels::Experimental::Controls controls; - // Use the native implementation since the CUDA 11.2.2 spmv implementation - // is not matching the bsr spmv test tolerance when OFFSET is int. 
- // See https://github.com/kokkos/kokkos-kernels/issues/1586 -#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (11200 <= CUSPARSE_VERSION) - controls.setParameter("algorithm", "native"); -#endif - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); - - y_vector_type ybsr("bsr_product_result", nRow); - auto h_ybsr = Kokkos::create_mirror_view(ybsr); - for (lno_t ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); - Kokkos::deep_copy(ybsr, h_ybsr); - - // Create the BsrMatrix for the check test - KokkosSparse::Experimental::BsrMatrix - Absr(Acrs, blockSize); - - // - // Make computation with the BsrMatrix format - // - KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybsr); - - // - // Compare the two products - // - using KATS = Kokkos::ArithTraits; - using mag_type = typename KATS::mag_type; - - const mag_type zero_mag = Kokkos::ArithTraits::zero(); - mag_type error = zero_mag, maxNorm = zero_mag; - - Kokkos::deep_copy(h_ycrs, ycrs); - Kokkos::deep_copy(h_ybsr, ybsr); - for (lno_t ir = 0; ir < nRow; ++ir) { - error = std::max(error, KATS::abs(h_ycrs(ir) - h_ybsr(ir))); - maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir))); - } + controls.setParameter("algorithm", alg); + KokkosSparse::spmv(controls, mode, alpha, a, x, beta, yAct); + } else { + KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); + } + + // compare yExp and yAct + auto hyExp = Kokkos::create_mirror_view(yExp); + auto hyAct = Kokkos::create_mirror_view(yAct); + Kokkos::deep_copy(hyExp, yExp); + Kokkos::deep_copy(hyAct, yAct); + + // max nnz per row is used for the tolerance + // for a transposed computation, need to transpose the matrix before + // seeing which rows are longest + size_t maxNnzPerRow; + if (mode_is_transpose(mode)) { + auto at = KokkosSparse::Impl::transpose_bsr_matrix(a); + maxNnzPerRow = + at.blockDim() * + KokkosSparse::Impl::graph_max_degree( + at.graph.row_map); + } else { + maxNnzPerRow = + a.blockDim() * + KokkosSparse::Impl::graph_max_degree( + a.graph.row_map); + } + 
+ /* assume that any floating-point op may introduce eps() error + scaling y is one op + dot product of x is two ops per entry (mul and add) - mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); - if ((tmps > zero_mag) && (maxNorm == zero_mag)) { - std::cout << " BSR - SpMV times MV >> blockSize " << blockSize - << " maxNorm " << maxNorm << " error " << error << " alpha " - << alpha << " beta " << beta << "\n"; - num_errors += 1; + 10x means same order of magnitude + */ + const mag_type tolerance = + KATS::eps() * KATS::abs(beta) * KATS::abs(max_y()) + + 10 * KATS::eps() * maxNnzPerRow * KATS::abs(alpha) * + KATS::abs(max_a()) * KATS::abs(max_x()); + + std::vector errIdx; + + for (ordinal_type i = 0; i < ordinal_type(hyAct.extent(0)); ++i) { + if (KATS::abs(hyExp(i) - hyAct(i)) > tolerance) { + errIdx.push_back(i); } + } - // - // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row - // - const mag_type tol = ((static_cast(nnz) / nRow) + 1) * - Kokkos::ArithTraits::epsilon(); - if (error > tol * maxNorm) { - std::cout << " BSR - SpMV times V >> blockSize " << blockSize << " ratio " - << error / maxNorm << " tol " << tol << " maxNorm " << maxNorm - << " alpha " << alpha << " beta " << beta << "\n"; - num_errors += 1; + if (!errIdx.empty()) { + std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMV failure!" + << std::endl; + std::cerr << "alg: " << (alg ? 
alg : "") << std::endl; + std::cerr << "mode: " << mode << std::endl; + std::cerr << "A: " << a.numRows() << "x" << a.numCols() + << std::endl; + std::cerr << "A blockdim: " << a.blockDim() << std::endl; + std::cerr << "alpha: " << alpha << std::endl; + std::cerr << "beta: " << beta << std::endl; + std::cerr << "maxNnzPerRow: " << maxNnzPerRow << std::endl; + std::cerr << "First 100 errors:" << std::endl; + std::cerr << "y\texp\tact\terr\ttol" << std::endl; + std::cerr << "-\t---\t---\t---\t---" << std::endl; + for (size_t i = 0; i < 100 && i < errIdx.size(); ++i) { + size_t ei = errIdx[i]; + // clang-format off + std::cerr << ei + << "\t" << hyExp(ei) + << "\t" << hyAct(ei) + << "\t" << KATS::abs(hyExp(ei) - hyAct(ei)) + << "\t" << tolerance + << std::endl; + // clang-format on } + } + + EXPECT_TRUE(errIdx.empty()); +} + +template +struct VectorTypeFor { + using type = Kokkos::View; +}; + +template +std::tuple::type, + typename VectorTypeFor::type> +spmv_corner_case_0_by_0(const char * /*mode*/, const int blockSize) { + using vector_type = typename VectorTypeFor::type; + Bsr a = bsr_corner_case_0_by_0(blockSize); + vector_type x("x", 0); + vector_type y("y", 0); + return std::make_tuple(a, x, y); +} + +template +std::tuple::type, + typename VectorTypeFor::type> +spmv_corner_case_0_by_1(const char *mode, const int blockSize) { + using vector_type = typename VectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + Bsr a = bsr_corner_case_0_by_1(blockSize); + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx); + vector_type y("y", ny); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); + + return std::make_tuple(a, x, y); +} + +template +std::tuple::type, + typename VectorTypeFor::type> 
+spmv_corner_case_1_by_0(const char *mode, const int blockSize) { + using vector_type = typename VectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + Bsr a = bsr_corner_case_1_by_0(blockSize); + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx); + vector_type y("y", ny); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); + + return std::make_tuple(a, x, y); +} + +/*! \brief + +*/ +template +std::tuple::type, + typename VectorTypeFor::type> +spmv_random(const char *mode, const int blockSize, const int blockRows, + const int blockCols) { + using scalar_type = typename Bsr::non_const_value_type; + + // expand to Bsr matrix + Bsr a = bsr_random(blockSize, blockRows, blockCols); + + // generate some random vectors + using vector_type = typename VectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx); + vector_type y("y", ny); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); - } // for (int blockSize = 1; blockSize <= bMax; ++blockSize) + return std::make_tuple(a, x, y); } -/// \brief Driver routine for checking BsrMatrix times multiple vector -template -void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, - const lno_t bMax, int &num_errors) { - // The mat_structure view is used to generate a matrix using - // finite difference (FD) or finite element (FE) discretization - // on a cartesian grid. 
- Kokkos::View mat_structure("Matrix Structure", - 3); - mat_structure(0, 0) = 7; // Request 7 grid point in 'x' direction - mat_structure(0, 1) = 0; // Add BC to the left - mat_structure(0, 2) = 0; // Add BC to the right - mat_structure(1, 0) = 5; // Request 11 grid point in 'y' direction - mat_structure(1, 1) = 0; // Add BC to the bottom - mat_structure(1, 2) = 0; // Add BC to the top - mat_structure(2, 0) = 9; // Request 13 grid point in 'y' direction - mat_structure(2, 1) = 0; // Add BC to the bottom - mat_structure(2, 2) = 0; // Add BC to the top - - typedef typename KokkosSparse::CrsMatrix - h_crsMat_t; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef Kokkos::View block_vector_t; - - h_crsMat_t mat_b1 = - Test::generate_structured_matrix3D("FD", mat_structure); - - num_errors = 0; - const int nrhs = 5; - - for (lno_t blockSize = 1; blockSize <= bMax; ++blockSize) { - // - // Fill blocks with random values - // - - lno_t nRow = blockSize * mat_b1.numRows(); - lno_t nCol = blockSize * mat_b1.numCols(); - size_type nnz = static_cast(blockSize) * - static_cast(blockSize) * mat_b1.nnz(); - - Kokkos::View d_rowmap("crsmatrix", nRow + 1); - auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); - - Kokkos::View d_colidx("crsmatrix", nnz); - auto h_colidx = Kokkos::create_mirror_view(d_colidx); - - Kokkos::View d_matval("crsmatrix", nnz); - auto h_matval = Kokkos::create_mirror_view(d_matval); - - // Create the entries - make_block_entries(mat_b1, blockSize, h_rowmap, - h_colidx, h_matval); - - Kokkos::deep_copy(d_matval, h_matval); - Kokkos::deep_copy(d_colidx, h_colidx); - Kokkos::deep_copy(d_rowmap, h_rowmap); - - // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, - d_colidx); - - block_vector_t xref("new_right_hand_side", nRow, nrhs); - auto h_xref = Kokkos::create_mirror_view(xref); - for (int jc = 0; jc < nrhs; ++jc) - for (lno_t ir = 0; ir < nRow; ++ir) 
set_random_value(h_xref(ir, jc)); - Kokkos::deep_copy(xref, h_xref); - - block_vector_t y0("y_init", nRow, nrhs); - auto h_y0 = Kokkos::create_mirror_view(y0); - for (int jc = 0; jc < nrhs; ++jc) - for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir, jc)); - Kokkos::deep_copy(y0, h_y0); - - block_vector_t ycrs("crs_product_result", nRow, nrhs); - auto h_ycrs = Kokkos::create_mirror_view(ycrs); - for (int jc = 0; jc < nrhs; ++jc) - for (lno_t ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); - Kokkos::deep_copy(ycrs, h_ycrs); - - // - // Compute the reference product with a CrsMatrix variable - // - KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); - - block_vector_t ybsr("bsr_product_result", nRow, nrhs); - auto h_ybsr = Kokkos::create_mirror_view(ybsr); - for (int jc = 0; jc < nrhs; ++jc) - for (lno_t ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); - Kokkos::deep_copy(ybsr, h_ybsr); - - // Create the BsrMatrix for the check test - KokkosSparse::Experimental::BsrMatrix - Absr(Acrs, blockSize); - - // - // Compute the product with the BsrMatrix format - // - KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybsr); - - Kokkos::deep_copy(h_ycrs, ycrs); - Kokkos::deep_copy(h_ybsr, ybsr); - - // - // Compare the two products - // - using KATS = Kokkos::ArithTraits; - using mag_type = typename KATS::mag_type; - - const mag_type zero_mag = Kokkos::ArithTraits::zero(); - mag_type error = zero_mag, maxNorm = zero_mag; - for (int jc = 0; jc < nrhs; ++jc) { - for (int ir = 0; ir < nRow; ++ir) { - error = std::max(error, - KATS::abs(h_ycrs(ir, jc) - h_ybsr(ir, jc))); - maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir, jc))); +/*! 
\brief create random x and y multivectors for a given matrix and spmv mode + */ +template +auto random_vecs_for_spmv(const char *mode, const Bsr &a) { + using scalar_type = typename Bsr::non_const_value_type; + using vector_type = typename VectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx); + vector_type y("y", ny); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); + + return std::make_tuple(x, y); +} + +/*! \brief test all combos of the provided matrix + */ +template +void test_spmv_combos(const char *mode, const Bsr &a) { + using scalar_type = typename Bsr::non_const_value_type; + + auto [x, y] = random_vecs_for_spmv(mode, a); + + for (auto alg : {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (scalar_type alpha : + {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { + for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), + scalar_type(-1.5)}) { + test_spmv(alg, mode, alpha, beta, a, x, y); } } + } +} - mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); - if ((tmps > zero_mag) && (maxNorm == zero_mag)) { - std::cout << " BSR - SpMV times MV >> blockSize " << blockSize - << " maxNorm " << maxNorm << " error " << error << " alpha " - << alpha << " beta " << beta << "\n"; - num_errors += 1; +/*! 
\brief test all combos of all matrices with different block sizes + */ +template +void test_spmv_corner_cases() { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + for (auto mode : {"N", "T", "C", "H"}) { + for (int bs : {1, 2, 5, 9}) { + test_spmv_combos(mode, bsr_corner_case_0_by_0(bs)); + test_spmv_combos(mode, bsr_corner_case_0_by_1(bs)); + test_spmv_combos(mode, bsr_corner_case_1_by_0(bs)); } + } +} - const mag_type tol = ((static_cast(nnz) / nRow) + 1) * - Kokkos::ArithTraits::epsilon(); - if (error > tol * maxNorm) { - std::cout << " BSR - SpMV times MV >> blockSize " << blockSize - << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " - << maxNorm << " alpha " << alpha << " beta " << beta << "\n"; - num_errors += 1; +template +void test_spmv_random() { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + for (auto mode : {"N", "T", "C", "H"}) { + for (int bs : {1, 2, 5, 9}) { + test_spmv_combos(mode, bsr_random(bs, 10, 10)); + test_spmv_combos(mode, bsr_random(bs, 10, 50)); + test_spmv_combos(mode, bsr_random(bs, 50, 10)); } + } - } // for (int blockSize = 1; blockSize <= bMax; ++blockSize) + // test a tougher case on a big matrix + constexpr int blockSizePrime = 7; + constexpr int smallPrime = 11; + constexpr int largePrime = 499; + for (auto mode : {"N", "T"}) { + test_spmv_combos(mode, + bsr_random(blockSizePrime, smallPrime, largePrime)); + } } -} // namespace Test_Bsr - -template -void testSpMVBsrMatrix() { - // - // Check a few corner cases - // - - // 0 x 0 case - { - typedef - typename KokkosSparse::Experimental::BsrMatrix - bsrMat_t; - bsrMat_t Absr("empty", 0, 0, 0, nullptr, nullptr, nullptr, 1); - typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; - x_vector_type x("corner-case-x", Absr.numCols()); - y_vector_type y("corner-case-y", Absr.numRows()); - Kokkos::deep_copy(y, static_cast(0)); - scalar_t alpha = static_cast(1); - scalar_t 
beta = static_cast(1); - const char fOp = 'N'; - int num_errors = 0; - try { - KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); - Kokkos::fence(); - } catch (std::exception &e) { - num_errors += 1; - std::cout << e.what(); - } - EXPECT_TRUE(num_errors == 0); +template +void test_spmv() { + test_spmv_corner_cases(); + test_spmv_random(); +} + +// ---------------------------------------------------------------------------- +// Multivector +// ---------------------------------------------------------------------------- + +template +void test_spm_mv(const char *alg, const char *mode, const Alpha &alpha, + const Beta &beta, const Bsr &a, const XVector &x, + const YVector &y) { + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + // generate expected result from reference implementation + YVector yExp("yExp", y.extent(0), y.extent(1)); + Kokkos::deep_copy(yExp, y); + reference_spmv(mode, alpha, a, x, beta, yExp); + + // scratch space for actual value (don't modify input) + YVector yAct("yAct", y.extent(0), y.extent(1)); + Kokkos::deep_copy(yAct, y); + + if (alg) { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", alg); + KokkosSparse::spmv(controls, mode, alpha, a, x, beta, yAct); + } else { + KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); + } + + // compare yExp and yAct + auto hyExp = Kokkos::create_mirror_view(yExp); + auto hyAct = Kokkos::create_mirror_view(yAct); + Kokkos::deep_copy(hyExp, yExp); + Kokkos::deep_copy(hyAct, yAct); + + // max nnz per row is used for the tolerance + // for a transposed computation, need to transpose the matrix before + // seeing which rows are longest + size_t maxNnzPerRow; + if (mode_is_transpose(mode)) { + auto at = KokkosSparse::Impl::transpose_bsr_matrix(a); + maxNnzPerRow = + 
at.blockDim() * + KokkosSparse::Impl::graph_max_degree( + at.graph.row_map); + } else { + maxNnzPerRow = + a.blockDim() * + KokkosSparse::Impl::graph_max_degree( + a.graph.row_map); } - // 0 x 1 case - { - typedef - typename KokkosSparse::Experimental::BsrMatrix - bsrMat_t; - bsrMat_t Absr("empty", 0, 1, 0, nullptr, nullptr, nullptr, 1); - typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; - x_vector_type x("corner-case-x", Absr.numCols()); - y_vector_type y("corner-case-y", Absr.numRows()); - Kokkos::deep_copy(y, static_cast(0)); - scalar_t alpha = static_cast(1); - scalar_t beta = static_cast(1); - const char fOp = 'N'; - int num_errors = 0; - try { - KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); - Kokkos::fence(); - } catch (std::exception &e) { - num_errors += 1; - std::cout << e.what(); + /* assume that any floating-point op may introduce eps() error + scaling y is one op + dot product of x is two ops per entry (mul and add) + */ + const mag_type tolerance = + KATS::eps() * KATS::abs(beta) * KATS::abs(max_y()) + + 10 * KATS::eps() * maxNnzPerRow * KATS::abs(alpha) * + KATS::abs(max_a()) * KATS::abs(max_x()); + + std::vector> errIdx; + + for (ordinal_type i = 0; i < ordinal_type(hyAct.extent(0)); ++i) { + for (ordinal_type j = 0; j < ordinal_type(hyAct.extent(1)); ++j) { + if (KATS::abs(hyExp(i, j) - hyAct(i, j)) > tolerance) { + errIdx.push_back({i, j}); + } } - EXPECT_TRUE(num_errors == 0); } - // 1 x 0 case - { - typedef - typename KokkosSparse::Experimental::BsrMatrix - bsrMat_t; - bsrMat_t Absr("empty", 1, 0, 0, nullptr, nullptr, nullptr, 1); - typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; - x_vector_type x("corner-case-x", Absr.numCols()); - y_vector_type y("corner-case-y", Absr.numRows()); - Kokkos::deep_copy(y, static_cast(0)); - scalar_t alpha = 
static_cast(1); - scalar_t beta = static_cast(1); - const char fOp = 'N'; - int num_errors = 0; - try { - KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); - Kokkos::fence(); - } catch (std::exception &e) { - num_errors += 1; - std::cout << e.what(); + if (!errIdx.empty()) { + std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMMV failure!" + << std::endl; + std::cerr << "alg: " << (alg ? alg : "") << std::endl; + std::cerr << "mode: " << mode << std::endl; + std::cerr << "A: " << a.numRows() << "x" << a.numCols() + << std::endl; + std::cerr << "A blockdim: " << a.blockDim() << std::endl; + std::cerr << "alpha: " << alpha << std::endl; + std::cerr << "beta: " << beta << std::endl; + std::cerr << "maxNnzPerRow: " << maxNnzPerRow << std::endl; + std::cerr << "First 100 errors:" << std::endl; + std::cerr << "i\tj\texp\tact\terr\ttol" << std::endl; + std::cerr << "-\t-\t---\t---\t---\t---" << std::endl; + for (size_t e = 0; e < 100 && e < errIdx.size(); ++e) { + auto ij = errIdx[e]; + auto i = ij.first; + auto j = ij.second; + // clang-format off + std::cerr << i << "\t" << j + << "\t" << hyExp(i,j) + << "\t" << hyAct(i,j) + << "\t" << KATS::abs(hyExp(i,j) - hyAct(i,j)) + << "\t" << tolerance + << std::endl; + // clang-format on } - EXPECT_TRUE(num_errors == 0); } - // - // Test for the operation y <- alpha * Op(A) * x + beta * y - // - - // Define the function Op: Op(A) = A, Op(A) = conj(A), Op(A) = A^T, Op(A) = - // A^H - std::vector modes = {'N', 'C', 'T', 'H'}; - - // Define a set of pairs (alpha, beta) - std::vector testAlphaBeta = {0.0, 0.0, -1.0, 0.0, - 0.0, 1.0, 3.1, -2.5}; - - // - // Set the largest block size for the block matrix - // The code will create matrices with block sizes 1, .., bMax - // - constexpr lno_t bMax = 13; - - // - //--- Test single vector case - // - for (const auto mode : modes) { - int num_errors = 0; - for (size_t ii = 0; ii < testAlphaBeta.size(); ii += 2) { - auto alpha_s = static_cast(testAlphaBeta[ii]); - auto beta_s = 
static_cast(testAlphaBeta[ii + 1]); - num_errors = 0; - Test_Bsr::check_bsrm_times_v( - &mode, alpha_s, beta_s, bMax, num_errors); - if (num_errors > 0) { - std::cout << "KokkosSparse::Test::spmv_bsr: " << num_errors - << " errors of %i with params: " << bMax << " " << mode << " " - << Kokkos::ArithTraits::abs(alpha_s) << " " - << Kokkos::ArithTraits::abs(beta_s) << std::endl; + EXPECT_TRUE(errIdx.empty()); +} + +template +struct MultiVectorTypeFor { + using type = Kokkos::View; +}; + +/*! \brief create random x and y multivectors for a given matrix and spmv mode + */ +template +auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, + const size_t numVecs) { + using scalar_type = typename Bsr::non_const_value_type; + using vector_type = typename MultiVectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx, numVecs); + vector_type y("y", ny, numVecs); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); + + return std::make_tuple(x, y); +} + +template +void test_spm_mv_combos(const char *mode, const Bsr &a) { + using scalar_type = typename Bsr::non_const_value_type; + + for (size_t numVecs : {1, 2, 7}) { // num multivecs + auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); + for (auto alg : + {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), + scalar_type(3.7)}) { + for (scalar_type beta : {scalar_type(0), scalar_type(1), + scalar_type(-1), scalar_type(-1.5)}) { + test_spm_mv(alg, mode, alpha, beta, a, x, y); + } } - EXPECT_TRUE(num_errors == 0); } } } -template -void testBsrMatrix_SpM_MV() { - // - // Test for the operation Y <- alpha * Op(A) * X + beta * Y - // - - // Define the function 
Op: Op(A) = A, Op(A) = conj(A), Op(A) = A^T, Op(A) = - // A^H - std::vector modes = {'N', 'C', 'T', 'H'}; - - // Define a set of pairs (alpha, beta) - std::vector testAlphaBeta = {0.0, 0.0, -1.0, 0.0, - 0.0, 1.0, 3.1, -2.5}; - - // - // Set the largest block size for the block matrix - // The code will create matrices with block sizes 1, .., bMax - // - const lno_t bMax = 13; - - //--- Test multiple vector case - for (auto mode : modes) { - int num_errors = 0; - for (size_t ii = 0; ii < testAlphaBeta.size(); ii += 2) { - auto alpha_s = static_cast(testAlphaBeta[ii]); - auto beta_s = static_cast(testAlphaBeta[ii + 1]); - num_errors = 0; - Test_Bsr::check_bsrm_times_mv( - &mode, alpha_s, beta_s, bMax, num_errors); - if (num_errors > 0) { - std::cout << "KokkosSparse::Test::spm_mv_bsr: " << num_errors - << " errors of " << bMax << " with params: " << mode << " " - << Kokkos::ArithTraits::abs(alpha_s) << " " - << Kokkos::ArithTraits::abs(beta_s) << std::endl; - } - EXPECT_TRUE(num_errors == 0); +/*! 
\brief test all combos of all matrices with different block sizes + */ +template +void test_spm_mv_corner_cases() { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + for (auto mode : {"N", "T", "C", "H"}) { + for (int bs : {1, 2, 5, 9}) { + test_spm_mv_combos(mode, bsr_corner_case_0_by_0(bs)); + test_spm_mv_combos(mode, bsr_corner_case_0_by_1(bs)); + test_spm_mv_combos(mode, bsr_corner_case_1_by_0(bs)); } } } +template +void test_spm_mv_random() { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + // thoroughly test smaller matrices + for (auto mode : {"N", "T", "C", "H"}) { + for (int bs : {1, 2, 5, 9}) { + test_spm_mv_combos(mode, bsr_random(bs, 10, 10)); + test_spm_mv_combos(mode, bsr_random(bs, 10, 50)); + test_spm_mv_combos(mode, bsr_random(bs, 50, 10)); + } + } + + // test a tougher case on a big matrix + constexpr int blockSizePrime = 7; + constexpr int smallPrime = 11; + constexpr int largePrime = 499; + for (auto mode : {"N", "T"}) { + test_spm_mv_combos( + mode, bsr_random(blockSizePrime, smallPrime, largePrime)); + } +} + +template +void test_spm_mv() { + test_spm_mv_corner_cases(); + test_spm_mv_random(); +} + +} // namespace Test_Spmv_Bsr + ////////////////////////// -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##bsrmat_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - testSpMVBsrMatrix(); \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##bsr_spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + Test_Spmv_Bsr::test_spmv(); \ } #include @@ -579,11 +645,12 @@ void testBsrMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ - testBsrMatrix_SpM_MV(); \ +#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, 
LAYOUT, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bsr_spmmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + Test_Spmv_Bsr::test_spm_mv(); \ } #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 8b3e64ecae..1a4c78e08e 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -1049,6 +1049,259 @@ void run_test_sptrsv() { } } +template +void run_test_sptrsv_streams(int test_algo, int nstreams) { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; + using crsMat_t = CrsMatrix; + + // Workaround for OpenMP: skip tests if concurrency < nstreams because of + // not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; + } + } +#endif + if (!run_streams_test) return; + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + + const size_type nrows = 5; + const size_type nnz = 10; + + std::vector instances; + if (nstreams == 1) + instances = Kokkos::Experimental::partition_space(execution_space(), 1); + else if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + 
instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else // (nstreams == 4) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector row_map_v(nstreams); + std::vector entries_v(nstreams); + std::vector values_v(nstreams); + std::vector rhs_v(nstreams); + std::vector lhs_v(nstreams); + + RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); + EntriesType_hostmirror hentries("hentries", nnz); + ValuesType_hostmirror hvalues("hvalues", nnz); + + // Upper tri + { + hrow_map(0) = 0; + hrow_map(1) = 2; + hrow_map(2) = 4; + hrow_map(3) = 7; + hrow_map(4) = 9; + hrow_map(5) = 10; + + hentries(0) = 0; + hentries(1) = 2; + hentries(2) = 1; + hentries(3) = 4; + hentries(4) = 2; + hentries(5) = 3; + hentries(6) = 4; + hentries(7) = 3; + hentries(8) = 4; + hentries(9) = 4; + + for (size_type i = 0; i < nnz; ++i) { + hvalues(i) = ONE; + } + + for (int i = 0; i < nstreams; i++) { + // Allocate U + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = false; + if (test_algo == 0) + 
kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy( + 0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + if (sum != lhs_v[i].extent(0)) { + std::cout << "Upper Tri Solve FAILURE on stream " << i << std::endl; + kh_v[i].get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); + + kh_v[i].destroy_sptrsv_handle(); + } + } + + // Lower tri + { + hrow_map(0) = 0; + hrow_map(1) = 1; + hrow_map(2) = 2; + hrow_map(3) = 4; + hrow_map(4) = 6; + hrow_map(5) = 10; + + hentries(0) = 0; + hentries(1) = 1; + hentries(2) = 0; + hentries(3) = 2; + hentries(4) = 2; + hentries(5) = 3; + hentries(6) = 1; + hentries(7) = 2; + hentries(8) = 3; + hentries(9) = 4; + + for (size_type i = 0; i < nnz; ++i) { + hvalues(i) = ONE; + } + + for (int i = 0; i < nstreams; i++) { + // Allocate L + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", 
nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = true; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy( + 0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + if (sum != lhs_v[i].extent(0)) { + std::cout << "Lower Tri Solve FAILURE on stream " << i << std::endl; + kh_v[i].get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); + + kh_v[i].destroy_sptrsv_handle(); + } + } +} + } // namespace Test template (); } +template +void test_sptrsv_streams() { + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 1 stream" << std::endl; + Test::run_test_sptrsv_streams(0, 1); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(0, 2); + + std::cout << 
"SPTRSVAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(0, 3); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; + Test::run_test_sptrsv_streams(0, 4); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 1 stream" << std::endl; + Test::run_test_sptrsv_streams(1, 1); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 2); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 3); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 4); + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + if (std::is_same::value && + std::is_same::value) { + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 1 stream" << std::endl; + Test::run_test_sptrsv_streams(2, 1); + + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 2); + + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 3); + + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 4 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 4); + } +#endif +} + #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##sptrsv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_sptrsv(); \ + test_sptrsv_streams(); \ } #include diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index 713c201a8f..e3312c0a41 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -29,26 +29,28 @@ struct Parameters { int multi_color_scale; int shmemsize; int team_size; - int use_dynamic_scheduling; - int verbose; + bool use_dynamic_scheduling; + bool verbose; int spgemm_step; int vector_size; - int check_output; + bool 
check_output; int mkl_sort_option; int mkl_keep_output; - int calculate_read_write_cost; - char *coloring_input_file; - char *coloring_output_file; + bool calculate_read_write_cost; + std::string coloring_input_file; + std::string coloring_output_file; int minhashscale; int use_threads; int use_openmp; int use_cuda; int use_hip; + int use_sycl; + int use_openmptarget; int use_serial; int a_mem_space, b_mem_space, c_mem_space, work_mem_space; - char *a_mtx_bin_file, *b_mtx_bin_file, *c_mtx_bin_file; + std::string a_mtx_bin_file, b_mtx_bin_file, c_mtx_bin_file; bool compression2step; int left_lower_triangle, right_lower_triangle; int left_sort, right_sort; @@ -62,7 +64,7 @@ struct Parameters { int cache_flush; double first_level_hash_cut_off; double compression_cut_off; - size_t MaxColDenseAcc; + int MaxColDenseAcc; // 0 - no flush // 1 - soft flush // 2 - hard flush with rand. @@ -74,24 +76,26 @@ struct Parameters { multi_color_scale = 1; shmemsize = 16128; team_size = -1; - use_dynamic_scheduling = 0; - verbose = 0; + use_dynamic_scheduling = false; + verbose = false; spgemm_step = '0'; vector_size = -1; - check_output = 0; + check_output = false; mkl_sort_option = 7; mkl_keep_output = 1; - calculate_read_write_cost = 0; - coloring_input_file = NULL; - coloring_output_file = NULL; + calculate_read_write_cost = false; + coloring_input_file = ""; + coloring_output_file = ""; minhashscale = 1; use_threads = 0; use_openmp = 0; use_cuda = 0; use_hip = 0; + use_sycl = 0; + use_openmptarget = 0; use_serial = 0; a_mem_space = b_mem_space = c_mem_space = work_mem_space = 1; - a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = NULL; + a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = ""; compression2step = true; left_lower_triangle = 0; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 761c919aac..95a3459699 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -62,52 +62,100 
@@ #endif namespace Test { -template ::value> -struct multivector_layout_adapter; - -template -struct multivector_layout_adapter { - typedef typename ViewType::value_type Scalar; - typedef typename ViewType::device_type Device; - typedef Kokkos::View - BaseTypeRight; - typedef Kokkos::View - BaseTypeDefault; - typedef - typename std::conditional::value, - BaseTypeRight, BaseTypeDefault>::type BaseType; - - static ViewType view(const BaseType& v) { - return Kokkos::subview(v, Kokkos::ALL, Kokkos::ALL, 0); - }; -}; -template -struct multivector_layout_adapter { - typedef typename ViewType::value_type Scalar; - typedef typename ViewType::device_type Device; - typedef Kokkos::View - BaseTypeRight; - typedef Kokkos::View - BaseTypeDefault; - typedef - typename std::conditional::value, - BaseTypeRight, BaseTypeDefault>::type BaseType; - - static ViewType view(const BaseType& v) { - return Kokkos::subview(v, Kokkos::ALL, Kokkos::ALL); - }; +// Utility class for testing kernels with rank-1 and rank-2 views that may be +// LayoutStride. Simplifies making a LayoutStride view of a given size that is +// actually noncontiguous, and host-device transfers for checking results on +// host. +// +// Constructed with label and extent(s), and then provides 5 views as members: +// - d_view, and a const-valued alias d_view_const +// - h_view +// - d_base +// - h_base +// d_view is of type ViewType, and has the extents passed to the constructor. +// h_view is a mirror of d_view. +// d_base (and its mirror h_base) are contiguous views, so they can be +// deep-copied to each other. d_view aliases d_base, and h_view aliases h_base. +// This means that copying between d_base and h_base +// also copies between d_view and h_view. 
+// +// If the Boolean template parameter 'createMirrorView' is: +// - 'true' (default value), then this utility class will use +// Kokkos::create_mirror_view(); +// - 'false', then this utility class will use Kokkos::create_mirror() +template +struct view_stride_adapter { + static_assert(Kokkos::is_view_v, + "view_stride_adapter: ViewType must be a Kokkos::View"); + static_assert(ViewType::rank >= 1 && ViewType::rank <= 2, + "view_stride_adapter: ViewType must be rank 1 or rank 2"); + + static constexpr bool strided = std::is_same::value; + static constexpr int rank = ViewType::rank; + + using DView = ViewType; + using HView = typename DView::HostMirror; + // If not strided, the base view types are the same as DView/HView. + // But if strided, the base views have one additional dimension, so that + // d_view/h_view have stride > 1 between consecutive elements. + using DViewBase = std::conditional_t< + strided, + Kokkos::View, + DView>; + using HViewBase = typename DViewBase::HostMirror; + + view_stride_adapter(const std::string& label, int m, int n = 1) { + if constexpr (rank == 1) { + if constexpr (strided) { + d_base = DViewBase(label, m, 2); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) + : Kokkos::create_mirror(d_base); + d_view = Kokkos::subview(d_base, Kokkos::ALL(), 0); + h_view = Kokkos::subview(h_base, Kokkos::ALL(), 0); + } else { + d_base = DViewBase(label, m); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) + : Kokkos::create_mirror(d_base); + d_view = d_base; + h_view = h_base; + } + } else { + if constexpr (strided) { + d_base = DViewBase(label, m, n, 2); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) + : Kokkos::create_mirror(d_base); + d_view = + Kokkos::subview(d_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); + h_view = + Kokkos::subview(h_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); + } else { + d_base = DViewBase(label, m, n); + h_base = createMirrorView ? 
Kokkos::create_mirror_view(d_base) + : Kokkos::create_mirror(d_base); + d_view = d_base; + h_view = h_base; + } + } + d_view_const = d_view; + } + + // Have both const and nonconst versions of d_view (with same underlying + // data), since we often test BLAS with both + DView d_view; + typename DView::const_type d_view_const; + HView h_view; + DViewBase d_base; + HViewBase h_base; }; template void EXPECT_NEAR_KK(Scalar1 val1, Scalar2 val2, Scalar3 tol, std::string msg = "") { - typedef Kokkos::Details::ArithTraits AT1; - typedef Kokkos::Details::ArithTraits AT3; + typedef Kokkos::ArithTraits AT1; + typedef Kokkos::ArithTraits AT3; EXPECT_LE((double)AT1::abs(val1 - val2), (double)AT3::abs(tol)) << msg; } @@ -116,8 +164,8 @@ void EXPECT_NEAR_KK_REL(Scalar1 val1, Scalar2 val2, Scalar3 tol, std::string msg = "") { typedef typename std::remove_reference::type hv1_type; typedef typename std::remove_reference::type hv2_type; - const auto ahv1 = Kokkos::Details::ArithTraits::abs(val1); - const auto ahv2 = Kokkos::Details::ArithTraits::abs(val2); + const auto ahv1 = Kokkos::ArithTraits::abs(val1); + const auto ahv2 = Kokkos::ArithTraits::abs(val2); EXPECT_NEAR_KK(val1, val2, tol * Kokkos::max(ahv1, ahv2), msg); } @@ -205,7 +253,7 @@ struct SharedVanillaGEMM { typedef Kokkos::View SubviewTypeB; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -520,6 +568,68 @@ int string_compare_no_case(const char* str1, const char* str2) { return strcmp(str1_s.c_str(), str2_s.c_str()); } +int string_compare_no_case(const std::string& str1, const std::string& str2) { + return string_compare_no_case(str1.c_str(), str2.c_str()); +} +/// \brief Coo matrix class for testing purposes.
+/// \tparam ScalarType +/// \tparam LayoutType +/// \tparam ExeSpaceType +template +class RandCooMat { + private: + using RowViewTypeD = Kokkos::View; + using ColViewTypeD = Kokkos::View; + using DataViewTypeD = Kokkos::View; + RowViewTypeD __row_d; + ColViewTypeD __col_d; + DataViewTypeD __data_d; + + template + T __getter_copy_helper(T src) { + T dst(std::string("RandCooMat.") + typeid(T).name() + " copy", + src.extent(0)); + Kokkos::deep_copy(dst, src); + ExeSpaceType().fence(); + return dst; + } + + public: + std::string info; + /// Constructs a random coo matrix with negative indices. + /// \param m The max row id + /// \param n The max col id + /// \param n_tuples The number of tuples. + /// \param min_val The minimum scalar value in the matrix. + /// \param max_val The maximum scalar value in the matrix. + RandCooMat(int64_t m, int64_t n, int64_t n_tuples, ScalarType min_val, + ScalarType max_val) { + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count() % + UINT32_MAX; + + info = std::string(std::string("RandCooMat<") + typeid(ScalarType).name() + + ", " + typeid(LayoutType).name() + ", " + + typeid(ExeSpaceType).name() + std::to_string(n) + + "...): rand seed: " + std::to_string(ticks) + "\n"); + Kokkos::Random_XorShift64_Pool random(ticks); + + __row_d = RowViewTypeD("RandCooMat.RowViewType", n_tuples); + Kokkos::fill_random(__row_d, random, -m, m); + + __col_d = ColViewTypeD("RandCooMat.ColViewType", n_tuples); + Kokkos::fill_random(__col_d, random, -n, n); + + __data_d = DataViewTypeD("RandCooMat.DataViewType", n_tuples); + Kokkos::fill_random(__data_d, random, min_val, max_val); + + ExeSpaceType().fence(); + } + auto get_row() { return __getter_copy_helper(__row_d); } + auto get_col() { return __getter_copy_helper(__col_d); } + auto get_data() { return __getter_copy_helper(__data_d); } +}; + /// /brief Cs (Compressed Sparse) matrix class for testing purposes. 
/// This class is for testing purposes only and will generate a random /// Crs / Ccs matrix when instantiated. The class is intentionally written @@ -528,16 +638,27 @@ int string_compare_no_case(const char* str1, const char* str2) { /// dim2 refers to either columns for a Crs matrix or rows for a Ccs matrix. /// \tparam ScalarType /// \tparam LayoutType -/// \tparam ExeSpaceType -template +/// \tparam Device +template ::size_type> class RandCsMatrix { + public: + using value_type = ScalarType; + using array_layout = LayoutType; + using device_type = Device; + using ordinal_type = Ordinal; + using size_type = Size; + using ValViewTypeD = Kokkos::View; + using IdViewTypeD = Kokkos::View; + using MapViewTypeD = Kokkos::View; + private: - using ValViewTypeD = Kokkos::View; - using IdViewTypeD = Kokkos::View; - using MapViewTypeD = Kokkos::View; - int64_t __dim2; - int64_t __dim1; - int64_t __nnz = 0; + using execution_space = typename Device::execution_space; + Ordinal __dim2; + Ordinal __dim1; + Size __nnz = 0; MapViewTypeD __map_d; IdViewTypeD __ids_d; ValViewTypeD __vals_d; @@ -556,19 +677,19 @@ class RandCsMatrix { /// 4. 
__map(i) - col_map(i - 1) is in [0, m] void __populate_random_cs_mat(uint64_t ticks) { std::srand(ticks); - for (int64_t col_idx = 0; col_idx < __dim1; col_idx++) { - int64_t r = std::rand() % (__dim2 + 1); + for (Ordinal col_idx = 0; col_idx < __dim1; col_idx++) { + Ordinal r = std::rand() % (__dim2 + 1); if (r == 0 || __fully_sparse) { // 100% sparse vector __map(col_idx) = __nnz; } else { // sparse vector with r elements // Populate r row ids - std::vector v(r); + std::vector v(r); - for (int64_t i = 0; i < r; i++) v.at(i) = i; + for (Ordinal i = 0; i < r; i++) v.at(i) = i; std::shuffle(v.begin(), v.end(), std::mt19937(std::random_device()())); - for (int64_t i = 0; i < r; i++) __ids(i + __nnz) = v.at(i); + for (Ordinal i = 0; i < r; i++) __ids(i + __nnz) = v.at(i); // Point to new column and accumulate number of non zeros __map(col_idx) = __nnz; @@ -581,8 +702,13 @@ class RandCsMatrix { // Copy to device Kokkos::deep_copy(__map_d, __map); - Kokkos::deep_copy(__ids_d, __ids); - ExeSpaceType().fence(); + IdViewTypeD tight_ids(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "RandCsMatrix.IdViewTypeD"), + __nnz); + Kokkos::deep_copy( + tight_ids, + Kokkos::subview(__ids, Kokkos::make_pair(0, static_cast(__nnz)))); + __ids_d = tight_ids; } template @@ -590,7 +716,6 @@ class RandCsMatrix { T dst(std::string("RandCsMatrix.") + typeid(T).name() + " copy", src.extent(0)); Kokkos::deep_copy(dst, src); - ExeSpaceType().fence(); return dst; } @@ -601,7 +726,7 @@ class RandCsMatrix { /// \param dim2 The second dimension: columns for Crs or rows for Ccs /// \param min_val The minimum scalar value in the matrix. /// \param max_val The maximum scalar value in the matrix. 
- RandCsMatrix(int64_t dim1, int64_t dim2, ScalarType min_val, + RandCsMatrix(Ordinal dim1, Ordinal dim2, ScalarType min_val, ScalarType max_val, bool fully_sparse = false) { __dim1 = dim1; __dim2 = dim2; @@ -618,7 +743,7 @@ class RandCsMatrix { info = std::string( std::string("RandCsMatrix<") + typeid(ScalarType).name() + ", " + - typeid(LayoutType).name() + ", " + typeid(ExeSpaceType).name() + ">(" + + typeid(LayoutType).name() + ", " + execution_space().name() + ">(" + std::to_string(dim2) + ", " + std::to_string(dim1) + "...): rand seed: " + std::to_string(ticks) + ", fully sparse: " + (__fully_sparse ? "true" : "false") + "\n"); @@ -633,17 +758,16 @@ class RandCsMatrix { // Copy to device Kokkos::deep_copy(__vals_d, __vals); - ExeSpaceType().fence(); } // O(c), where c is a constant. - ScalarType operator()(int64_t idx) { return __vals(idx); } - int64_t get_nnz() { return __nnz; } + ScalarType operator()(Size idx) { return __vals(idx); } + size_t get_nnz() { return size_t(__nnz); } // dimension2: This is either columns for a Crs matrix or rows for a Ccs // matrix. - int64_t get_dim2() { return __dim2; } + Ordinal get_dim2() { return __dim2; } // dimension1: This is either rows for Crs matrix or columns for a Ccs matrix. - int64_t get_dim1() { return __dim1; } + Ordinal get_dim1() { return __dim1; } ValViewTypeD get_vals() { return __getter_copy_helper(__vals_d); } IdViewTypeD get_ids() { return __getter_copy_helper(__ids_d); } MapViewTypeD get_map() { return __getter_copy_helper(__map_d); }