diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2488790254..558b6bd96d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -11,15 +11,16 @@ permissions: jobs: docs-check: - runs-on: ubuntu-latest + runs-on: [macos-latest] steps: - name: Install Dependencies run: | - sudo apt-get update - sudo apt-get install --no-install-recommends doxygen-latex - pip install sphinx - pip install breathe - pip install sphinx-rtd-theme + brew install doxygen + python3 -m pip install sphinx -v "sphinx==6.2.1" + python3 -m pip install breathe + python3 -m pip install sphinx-rtd-theme + sphinx-build --version + doxygen --version - name: checkout_kokkos_kernels uses: actions/checkout@v3 diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 8a5681f9c7..df6066d0d4 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -111,4 +111,4 @@ jobs: - name: test working-directory: kokkos-kernels/build - run: ctest -j2 --output-on-failure --timeout 3600 + run: ctest -j2 --output-on-failure --timeout 7200 diff --git a/CHANGELOG.md b/CHANGELOG.md index 91268a35fd..6bfc00b4fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,126 @@ # Change Log +## [4.2.00](https://github.com/kokkos/kokkos-kernels/tree/4.2.00) (2023-11-06) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.1.00...4.2.00) + +### New Features + +#### BLAS updates +- Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) + +#### LAPACK +- New component added for the implementation of LAPACK algorithms and to support associated TPLs [\#1985](https://github.com/kokkos/kokkos-kernels/pull/1985) +- Fix some issue with unit-test definition for SYCL backend in the new LAPACK component [\#2024](https://github.com/kokkos/kokkos-kernels/pull/2024) + +#### Sparse updates +- Extract diagonal blocks from a CRS matrix into separate CRS matrices 
[\#1947](https://github.com/kokkos/kokkos-kernels/pull/1947) +- Adding exec space instance to spmv [\#1932](https://github.com/kokkos/kokkos-kernels/pull/1932) +- Add merge-based SpMV [\#1911](https://github.com/kokkos/kokkos-kernels/pull/1911) +- Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (PSGS and Team_PSGS) [\#1906](https://github.com/kokkos/kokkos-kernels/pull/1906) +- Add a MergeMatrixDiagonal abstraction to KokkosSparse [\#1780](https://github.com/kokkos/kokkos-kernels/pull/1780) + +#### ODE updates +- Newton solver [\#1924](https://github.com/kokkos/kokkos-kernels/pull/1924) + +### Enhancements: + +#### Sparse +- MDF performance improvements exposing more parallelism in the implementation + - MDF: convert remaining count functor to hierarchical parallelism [\#1894](https://github.com/kokkos/kokkos-kernels/pull/1894) + - MDF: move most expensive kernels over to hierarchical parallelism [\#1893](https://github.com/kokkos/kokkos-kernels/pull/1893) +- Improvements to the Block Crs Matrix-Vector multiplication algorithm + - Improve BSR matrix SpMV Performance [\#1740](https://github.com/kokkos/kokkos-kernels/pull/1740) + - Disallow BsrMatrix tensor-core SpMV on non-scalar types [\#1937](https://github.com/kokkos/kokkos-kernels/pull/1937) + - remove triplicate sanity checks in BsrMatrix [\#1923](https://github.com/kokkos/kokkos-kernels/pull/1923) + - remove duplicate BSR SpMV tests [\#1922](https://github.com/kokkos/kokkos-kernels/pull/1922) +- Only deep_copy from device to host if supernodal sptrsv algorithms are used [\#1993](https://github.com/kokkos/kokkos-kernels/pull/1993) +- Improve KokkosSparse_kk_spmv [\#1979](https://github.com/kokkos/kokkos-kernels/pull/1979) + - Add 5 warm-up calls to get accurate, consistent timing + - Print out the matrix dimensions correctly when loading from disk +- sparse/impl: Make PSGS non-blocking [\#1917](https://github.com/kokkos/kokkos-kernels/pull/1917) + +#### ODE +- ODE: changing layout of temp mem in RK 
algorithms [\#1908](https://github.com/kokkos/kokkos-kernels/pull/1908) +- ODE: adding adaptivity test for RK methods [\#1896](https://github.com/kokkos/kokkos-kernels/pull/1896) + +#### Common utilities +- Common: remove half and bhalf implementations (now in Kokkos Core) [\#1981](https://github.com/kokkos/kokkos-kernels/pull/1981) +- KokkosKernels: switching from printf macro to function [\#1977](https://github.com/kokkos/kokkos-kernels/pull/1977) +- OrdinalTraits: constexpr functions [\#1976](https://github.com/kokkos/kokkos-kernels/pull/1976) +- Parallel prefix sum can infer view type [\#1974](https://github.com/kokkos/kokkos-kernels/pull/1974) + +#### TPL support +- BSPGEMM: removing cusparse testing for version older than 11.4.0 [\#1996](https://github.com/kokkos/kokkos-kernels/pull/1996) +- Revise KokkosBlas::nrm2 TPL implementation [\#1950](https://github.com/kokkos/kokkos-kernels/pull/1950) +- Add TPL oneMKL GEMV support [\#1912](https://github.com/kokkos/kokkos-kernels/pull/1912) +- oneMKL spmv [\#1882](https://github.com/kokkos/kokkos-kernels/pull/1882) + +### Build System: +- CMakeLists.txt: Update Kokkos version to 4.2.99 for version check [\#2003](https://github.com/kokkos/kokkos-kernels/pull/2003) +- CMake: Adding logic to catch bad Kokkos version [\#1990](https://github.com/kokkos/kokkos-kernels/pull/1990) +- Remove calling tribits_exclude_autotools_files() [\#1888](https://github.com/kokkos/kokkos-kernels/pull/1888) + +### Documentation and Testing: +- Update create_gs_handle docs [\#1958](https://github.com/kokkos/kokkos-kernels/pull/1958) +- docs: Add testing table [\#1876](https://github.com/kokkos/kokkos-kernels/pull/1876) +- docs: Note which builds have ETI disabled [\#1934](https://github.com/kokkos/kokkos-kernels/pull/1934) +- Generate HTML docs [\#1921](https://github.com/kokkos/kokkos-kernels/pull/1921) +- github/workflows: Pin sphinx version [\#1948](https://github.com/kokkos/kokkos-kernels/pull/1948) +- github/workflows/docs.yml: Use 
up-to-date doxygen version [\#1941](https://github.com/kokkos/kokkos-kernels/pull/1941) + +- Unit-Test: adding specific test for block sparse functions [\#1944](https://github.com/kokkos/kokkos-kernels/pull/1944) +- Update SYCL docker image to Cuda 11.7.1 [\#1939](https://github.com/kokkos/kokkos-kernels/pull/1939) +- Remove printouts from the unit tests of ger() and syr() [\#1933](https://github.com/kokkos/kokkos-kernels/pull/1933) +- update testing scripts [\#1960](https://github.com/kokkos/kokkos-kernels/pull/1960) +- Speed up BSR spmv tests [\#1945](https://github.com/kokkos/kokkos-kernels/pull/1945) +- Test_ODE_Newton: Add template parameters for Kokkos::pair [\#1929](https://github.com/kokkos/kokkos-kernels/pull/1929) +- par_ilut: Update documentation for fill_in_limit [\#2001](https://github.com/kokkos/kokkos-kernels/pull/2001) + +### Benchmarks: +- perf_test/sparse: Update GS perf_test for streams [\#1963](https://github.com/kokkos/kokkos-kernels/pull/1963) +- Batched sparse perf_tests: Don't write to source tree during build [\#1904](https://github.com/kokkos/kokkos-kernels/pull/1904) +- ParILUT bench: fix unused IS_GPU warning [\#1900](https://github.com/kokkos/kokkos-kernels/pull/1900) +- BsrMatrix SpMV Google Benchmark [\#1886](https://github.com/kokkos/kokkos-kernels/pull/1886) +- Use extraction timestamps for fetched Google Benchmark files [\#1881](https://github.com/kokkos/kokkos-kernels/pull/1881) +- Improve help text in perf tests [\#1875](https://github.com/kokkos/kokkos-kernels/pull/1875) + +### Cleanup: +- iostream clean-up in benchmarks [\#2004](https://github.com/kokkos/kokkos-kernels/pull/2004) +- Rename TestExecSpace to TestDevice [\#1970](https://github.com/kokkos/kokkos-kernels/pull/1970) +- remove Intel 2017 code (no longer supported) [\#1920](https://github.com/kokkos/kokkos-kernels/pull/1920) +- clean-up implementations for move of HIP outside of experimental [\#1999](https://github.com/kokkos/kokkos-kernels/pull/1999) + +### Bug Fixes: 
+- upstream iostream removal fix [\#1991](https://github.com/kokkos/kokkos-kernels/pull/1991), [\#1995](https://github.com/kokkos/kokkos-kernels/pull/1995) +- Test and fix gemv stream interface [\#1987](https://github.com/kokkos/kokkos-kernels/pull/1987) +- Test_Sparse_spmv_bsr.hpp: Workaround cuda 11.2 compiler error [\#1983](https://github.com/kokkos/kokkos-kernels/pull/1983) +- Fix improper use of execution space instances in ODE tests. Better handling of CudaUVMSpaces during build. [\#1973](https://github.com/kokkos/kokkos-kernels/pull/1973) +- Don't assume the default memory space is used [\#1969](https://github.com/kokkos/kokkos-kernels/pull/1969) +- MDF: set default verbosity explicitly to avoid valgrind warnings [\#1968](https://github.com/kokkos/kokkos-kernels/pull/1968) +- Fix sort_and_merge functions for in-place case [\#1966](https://github.com/kokkos/kokkos-kernels/pull/1966) +- SPMV_Struct_Functor: initialize numExterior to 0 [\#1957](https://github.com/kokkos/kokkos-kernels/pull/1957) +- Use rank-1 impl types when rank-2 vector is dynamically rank 1 [\#1953](https://github.com/kokkos/kokkos-kernels/pull/1953) +- BsrMatrix: Check if CUDA is enabled before checking architecture [\#1955](https://github.com/kokkos/kokkos-kernels/pull/1955) +- Avoid enum without fixed underlying type to fix SYCL [\#1940](https://github.com/kokkos/kokkos-kernels/pull/1940) +- Fix SpAdd perf test when offset/ordinal is not int [\#1928](https://github.com/kokkos/kokkos-kernels/pull/1928) +- Add KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS definition for architectures with independent thread scheduling [\#1927](https://github.com/kokkos/kokkos-kernels/pull/1927) +- Fix cm_generate_makefile --boundscheck [\#1926](https://github.com/kokkos/kokkos-kernels/pull/1926) +- Bsr compatibility [\#1925](https://github.com/kokkos/kokkos-kernels/pull/1925) +- BLAS: fix assignable check in gemv and gemm [\#1914](https://github.com/kokkos/kokkos-kernels/pull/1914) +- mdf: fix initial value in 
select pivot functor [\#1916](https://github.com/kokkos/kokkos-kernels/pull/1916) +- add missing headers, std::vector -> std::vector<...> [\#1909](https://github.com/kokkos/kokkos-kernels/pull/1909) +- Add missing include to Test_Sparse_MergeMatrix.hpp [\#1907](https://github.com/kokkos/kokkos-kernels/pull/1907) +- Remove non-existent dir from CMake include paths [\#1892](https://github.com/kokkos/kokkos-kernels/pull/1892) +- cusparse 12 spmv: check y vector alignment [\#1889](https://github.com/kokkos/kokkos-kernels/pull/1889) +- Change 'or' to '||' to fix compilation on MSVC [\#1885](https://github.com/kokkos/kokkos-kernels/pull/1885) +- Add missing KokkosKernels_Macros.hpp include [\#1884](https://github.com/kokkos/kokkos-kernels/pull/1884) +- Backward-compatible fix with kokkos@4.0 [\#1874](https://github.com/kokkos/kokkos-kernels/pull/1874) +- Fix for rocblas builds [\#1871](https://github.com/kokkos/kokkos-kernels/pull/1871) +- Correcting 'syr test' bug causing compilation errors with Trilinos [\#1870](https://github.com/kokkos/kokkos-kernels/pull/1870) +- Workaround for spiluk and sptrsv stream tests with OMP_NUM_THREADS of 1, 2, 3 [\#1864](https://github.com/kokkos/kokkos-kernels/pull/1864) +- bhalf_t fix for isnan function [\#2007](https://github.com/kokkos/kokkos-kernels/pull/2007) + + ## [4.1.00](https://github.com/kokkos/kokkos-kernels/tree/4.1.00) (2023-06-16) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.01...4.1.00) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa666ab33e..8e990cece5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 1) +SET(KokkosKernels_VERSION_MINOR 2) SET(KokkosKernels_VERSION_PATCH 00) SET(KokkosKernels_VERSION 
"${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") @@ -115,6 +115,7 @@ IF (KokkosKernels_INSTALL_TESTING) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(batched/dense/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(batched/sparse/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(blas/unit_test) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(ode/unit_test) @@ -124,9 +125,16 @@ ELSE() # Regular build, not install testing # Do all the regular option processing IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) - # This is a standalone build - FIND_PACKAGE(Kokkos REQUIRED) - MESSAGE(STATUS "Found Kokkos at ${Kokkos_DIR}") + # This is a standalone build + FIND_PACKAGE(Kokkos REQUIRED) + IF((${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.2.00")) + MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") + IF((${Kokkos_VERSION} VERSION_GREATER "4.2.99")) + MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") + ENDIF() + ELSE() + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires 4.1.00, 4.2.00 or develop") + ENDIF() ENDIF() INCLUDE(cmake/kokkos_backends.cmake) @@ -185,7 +193,7 @@ ELSE() "ALL" STRING "A list of components to enable in testing and building" - VALID_ENTRIES BATCHED BLAS GRAPH SPARSE ALL + VALID_ENTRIES BATCHED BLAS LAPACK GRAPH SPARSE ALL ) # ================================================================== @@ -236,6 +244,7 @@ ELSE() MESSAGE(" COMMON: ON") MESSAGE(" BATCHED: ${KokkosKernels_ENABLE_COMPONENT_BATCHED}") MESSAGE(" BLAS: ${KokkosKernels_ENABLE_COMPONENT_BLAS}") + MESSAGE(" LAPACK: ${KokkosKernels_ENABLE_COMPONENT_LAPACK}") MESSAGE(" GRAPH: ${KokkosKernels_ENABLE_COMPONENT_GRAPH}") MESSAGE(" 
SPARSE: ${KokkosKernels_ENABLE_COMPONENT_SPARSE}") MESSAGE(" ODE: ${KokkosKernels_ENABLE_COMPONENT_ODE}") @@ -280,6 +289,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_BLAS) INCLUDE(blas/CMakeLists.txt) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) + INCLUDE(lapack/CMakeLists.txt) + ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) INCLUDE(graph/CMakeLists.txt) ENDIF() @@ -398,6 +410,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_BLAS) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(blas/unit_test) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_LAPACK) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(lapack/unit_test) + ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_GRAPH) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) ENDIF() diff --git a/README.md b/README.md index 58127b912e..0da1057870 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ For a complete list of tunable Kokkos options, run spack info kokkos ```` -#### Settuping a development environment with Spack +#### Setting up a development environment with Spack Spack is generally most useful for installng packages to use. If you want to install all *dependencies* of Kokkos Kernels first so that you can actively develop a given Kokkos Kernels source this can still be done. 
Go to the Kokkos Kernels source code folder and run: ```` diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 27fb0bf338..9078281e59 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -31,10 +31,12 @@ #include #include +#include #include "Kokkos_Complex.hpp" #include "KokkosKernels_config.h" +#include "KokkosKernels_Macros.hpp" #include "KokkosKernels_SimpleUtils.hpp" #include "KokkosBlas_util.hpp" diff --git a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index beaef112f3..400c46544d 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -199,17 +199,31 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif @@ -249,17 +263,31 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif @@ -304,17 +332,31 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::axpy: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::axpy: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index 2f0be4b661..5b693bb87a 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -59,10 +59,17 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#endif return 1; } #endif @@ -87,10 +94,17 @@ KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( // Check compatibility of dimensions at run time. 
if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#endif return 1; } #endif @@ -143,12 +157,21 @@ struct TeamCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); +#endif return 1; } #endif @@ -181,12 +204,21 @@ struct TeamCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); +#endif return 1; } #endif @@ -245,12 +277,21 @@ struct TeamVectorCopy { // Check compatibility of dimensions at run time. 
if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); +#endif return 1; } #endif @@ -283,12 +324,21 @@ struct TeamVectorCopy { // Check compatibility of dimensions at run time. if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " "%d, " "B: %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index a6a7673e7b..854069289e 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -186,19 +186,35 @@ struct SerialDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(1) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: Second dimension of X and alpha do not match: " + "X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -232,18 +248,33 @@ struct SerialDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: First dimension of X and alpha do not match: X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -282,19 +313,35 @@ struct TeamDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(1) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: Second dimension of X and alpha do not match: " + "X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -337,18 +384,33 @@ struct TeamDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: First dimension of X and alpha do not match: X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -395,19 +457,35 @@ struct TeamVectorDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(1) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Second dimension of X and alpha do not match: " "X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: Second dimension of X and alpha do not match: " + "X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif @@ -450,18 +528,33 @@ struct TeamVectorDot { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::dot: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != dot.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::dot: First dimension of X and alpha do not match: X: " "%d x %d, dot: %d\n", (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::dot: First dimension of X and alpha do not match: X: " + "%d x %d, dot: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)dot.extent(0)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 0ef43ee4f8..e4e0d5b8b7 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -384,22 +384,39 @@ struct SerialGesv { // Check compatibility of dimensions at run time. 
if (A.extent(0) != tmp.extent(0) || A.extent(1) + 4 != tmp.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " "%d x %d, tmp (note: its second dimension should be the second " "dimension of A + 4): %d x %d\n", (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), (int)tmp.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " + "%d x %d, tmp (note: its second dimension should be the second " + "dimension of A + 4): %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), + (int)tmp.extent(1)); +#endif return 1; } if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -414,9 +431,15 @@ struct SerialGesv { if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); +#else + Kokkos::printf( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); +#endif return 1; } @@ -458,11 +481,19 @@ struct SerialGesv { if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + 
"KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -509,11 +540,19 @@ struct TeamGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -532,9 +571,15 @@ struct TeamGesv { if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); +#else + Kokkos::printf( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); +#endif return 1; } member.team_barrier(); @@ -587,11 +632,19 @@ struct TeamGesv { // Check compatibility of dimensions at run time. if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -645,11 +698,19 @@ struct TeamVectorGesv { // Check compatibility of dimensions at run time. 
if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif @@ -668,9 +729,15 @@ struct TeamVectorGesv { if (TeamVectorStaticPivoting::invoke( member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: the currently implemented static pivoting " "failed.\n"); +#else + Kokkos::printf( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); +#endif return 1; } @@ -724,11 +791,19 @@ struct TeamVectorGesv { // Check compatibility of dimensions at run time. 
if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || A.extent(0) != Y.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " "%d x %d, X: %d, Y: %d\n", (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index ebd789c2e8..0570bc4ccc 100644 --- a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -110,19 +110,35 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " + "X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " + "X: %d x %d, " + "V: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), 
(int)V.extent(0), (int)V.extent(1)); +#endif return 1; } #endif @@ -161,19 +177,35 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " + "X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " + "X: %d x %d, " + "V: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#endif return 1; } #endif @@ -214,19 +246,35 @@ KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " "X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and Y do not match: " + "X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != V.extent(0) || X.extent(1) != V.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " "X: %d x %d, " "V: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::HadamardProduct: Dimensions of X and V do not match: " + "X: %d x %d, " + "V: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)V.extent(0), (int)V.extent(1)); +#endif return 1; } #endif diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index f413ba612c..f70fa6b963 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -49,8 +49,7 @@ constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { // buffering algorithm by a factor of 2. 
#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) template <> -constexpr KOKKOS_INLINE_FUNCTION int -kk_gemm_dbl_buf_tile_k() { +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { return 16; } #endif diff --git a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index 4f90c0be38..5e5b7e13cc 100644 --- a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -204,17 +204,31 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif @@ -247,17 +261,31 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif @@ -291,17 +319,31 @@ KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " "Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::xpay: Dimensions of X and Y do not match: X: %d x %d, " + "Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " "%d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::xpay: First dimension of X and alpha do not match: X: " + "%d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } #endif diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index 4f62d0b0d4..4725e0220d 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -36,14 +36,17 @@ namespace KokkosBatched { /// C = alpha * op(A) * op(B) + beta * C /// /// \tparam ArgTransA Specifies what op does to A: +/// /// Trans::NoTranspose for non-transpose /// Trans::Transpose for transpose /// Trans::ConjTranspose for conjugate transpose /// \tparam ArgTransB Specifies what op does to B: +/// /// Trans::NoTranspose for non-transpose /// Trans::Transpose for transpose /// Trans::ConjTranspose for conjugate transpose /// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// /// AViewType, BViewType, and CViewType: /// BatchLayout::Left Batch dimension is leftmost /// BatchLayout::Right Batch dimension is rightmost @@ -61,13 +64,16 @@ namespace KokkosBatched { /// See struct BatchedGemmHandle for details. 
/// \param alpha [in] Input coefficient used for multiplication with A /// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// /// If ArgBatchSzDim == "BatchLayout::Right", matrix A is MxKxB /// If ArgBatchSzDim == "BatchLayout::Left", matrix A is BxMxK /// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// /// If ArgBatchSzDim == "BatchLayout::Right", matrix B is KxNxB /// If ArgBatchSzDim == "BatchLayout::Left", matrix B is BxKxN /// \param beta [in] Input coefficient used for multiplication with C /// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// /// If ArgBatchSzDim == "BatchLayout::Right", matrix C is MxNxB /// If ArgBatchSzDim == "BatchLayout::Left", matrix C is BxMxN /// \return 0 upon success, non-zero otherwise diff --git a/batched/dense/src/KokkosBatched_Vector.hpp b/batched/dense/src/KokkosBatched_Vector.hpp index 23fd62655a..71d159cb03 100644 --- a/batched/dense/src/KokkosBatched_Vector.hpp +++ b/batched/dense/src/KokkosBatched_Vector.hpp @@ -120,21 +120,19 @@ struct DefaultVectorLength, Kokkos::CudaUVMSpace> { #if defined(KOKKOS_ENABLE_HIP) template <> -struct DefaultVectorLength { +struct DefaultVectorLength { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength { +struct DefaultVectorLength { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength, - Kokkos::Experimental::HIPSpace> { +struct DefaultVectorLength, Kokkos::HIPSpace> { enum : int { value = 16 }; }; template <> -struct DefaultVectorLength, - Kokkos::Experimental::HIPSpace> { +struct DefaultVectorLength, Kokkos::HIPSpace> { enum : int { value = 16 }; }; #endif @@ -189,21 +187,19 @@ struct DefaultInternalVectorLength, #if defined(KOKKOS_ENABLE_HIP) template <> -struct DefaultInternalVectorLength { +struct DefaultInternalVectorLength { enum : int { value = 8 }; }; template <> -struct DefaultInternalVectorLength { +struct DefaultInternalVectorLength { enum : int { value = 4 }; }; template <> -struct 
DefaultInternalVectorLength, - Kokkos::Experimental::HIPSpace> { +struct DefaultInternalVectorLength, Kokkos::HIPSpace> { enum : int { value = 4 }; }; template <> -struct DefaultInternalVectorLength, - Kokkos::Experimental::HIPSpace> { +struct DefaultInternalVectorLength, Kokkos::HIPSpace> { enum : int { value = 2 }; }; #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp index a2b9edf1e6..3c58f432ec 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp @@ -20,32 +20,32 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, @@ -54,32 +54,32 @@ TEST_F(TestCategory, BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { 
typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); + test_batched_gemm, Kokkos::complex, + param_tag_type>(); } #endif @@ -90,7 +90,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { @@ -98,7 +98,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { @@ -106,7 +106,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { @@ -114,7 +114,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ @@ -124,7 +124,7 @@ TEST_F(TestCategory, BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } 
TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { @@ -132,7 +132,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { @@ -140,7 +140,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { @@ -148,7 +148,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp index 00561e0317..62a4a291a8 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp @@ -25,7 +25,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { @@ -33,7 +33,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { @@ -41,7 +41,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { @@ -49,7 +49,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - 
test_batched_gemm(); } /********************* BatchLayout::Right *********************/ @@ -58,7 +58,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { @@ -66,7 +66,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { @@ -74,7 +74,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { @@ -82,7 +82,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT @@ -98,32 +98,32 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { @@ -131,32 +131,32 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, 
batched_scalar_batched_gemm_t_nt_half_half_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT @@ -167,28 +167,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { @@ -196,28 +196,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - 
test_batched_gemm(); + test_batched_gemm(); } #endif @@ -228,28 +228,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_left) { ::Test::SharedParamTag; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { @@ -257,27 +257,27 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp index 2bde3f7fad..90ce5addc3 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp @@ -30,6 +30,7 @@ namespace Axpy { template struct Functor_TestBatchedSerialAxpy { + using execution_space = typename 
DeviceType::execution_space; const alphaViewType _alpha; const ViewType _X; const ViewType _Y; @@ -54,7 +55,7 @@ struct Functor_TestBatchedSerialAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _X.extent(0)); + Kokkos::RangePolicy policy(0, _X.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp index de677b1045..ed647f1e3b 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy_Complex.hpp @@ -16,11 +16,11 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_axpy_nt_dcomplex_dcomplex) { - test_batched_axpy, + test_batched_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_serial_axpy_nt_dcomplex_double) { - test_batched_axpy, double>(); + test_batched_axpy, double>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp index 078e4bfa8f..3f1f6af2fd 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_axpy_nt_float_float) { - test_batched_axpy(); + test_batched_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_axpy_nt_double_double) { - test_batched_axpy(); + test_batched_axpy(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialEigendecomposition.hpp b/batched/dense/unit_test/Test_Batched_SerialEigendecomposition.hpp index 5e90d5ae45..7eb2b89c83 100644 --- 
a/batched/dense/unit_test/Test_Batched_SerialEigendecomposition.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialEigendecomposition.hpp @@ -34,6 +34,7 @@ namespace Test { typename ViewRank2Type, typename WorkViewType> struct Functor_TestBatchedSerialEigendecomposition { + using execution_space = typename DeviceType::execution_space; ViewRank3Type _A; ViewRank2Type _Er, _Ei; ViewRank3Type _UL, _UR; @@ -70,7 +71,7 @@ namespace Test { >::value ? "::ComplexFloat" : std::is_same >::value ? "::ComplexDouble" : "::UnknownValueType" ); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion( name.c_str() ); - Kokkos::RangePolicy policy(0, _A.extent(0)); + Kokkos::RangePolicy policy(0, _A.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialEigendecomposition_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialEigendecomposition_Real.hpp index 2e82468f8b..ad4b790717 100644 --- a/batched/dense/unit_test/Test_Batched_SerialEigendecomposition_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialEigendecomposition_Real.hpp @@ -16,13 +16,13 @@ /* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_float ) { - test_batched_serial_eigendecomposition(); + test_batched_serial_eigendecomposition(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_double ) { - test_batched_serial_eigendecomposition(); + test_batched_serial_eigendecomposition(); } #endif */ diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp index 8304657849..7f27fa7dcf 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp @@ -40,6 +40,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialGemm { + using execution_space = typename 
DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -66,7 +67,7 @@ struct Functor_TestBatchedSerialGemm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _c.extent(0)); + Kokkos::RangePolicy policy(0, _c.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp index 01988c9e51..f671292c98 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm_Complex.hpp @@ -21,39 +21,39 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_dcomplex_dcomplex) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_dcomplex_dcomplex) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_dcomplex_dcomplex) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_dcomplex) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_gemm_ct_nt_dcomplex_dcomplex ) { // typedef ::Test::Gemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked 
algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_serial_gemm_nt_ct_dcomplex_dcomplex ) { // typedef ::Test::Gemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_gemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -62,39 +62,39 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_dcomplex_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, - param_tag_type, algo_tag_type>(); + test_batched_gemm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_dcomplex_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, - param_tag_type, algo_tag_type>(); + test_batched_gemm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_dcomplex_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, - param_tag_type, algo_tag_type>(); + test_batched_gemm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_dcomplex_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm, double, - param_tag_type, algo_tag_type>(); + test_batched_gemm, double, param_tag_type, + algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::Gemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_gemm,double,param_tag_type,algo_tag_type>(); 
// } // TEST_F( TestCategory, batched_scalar_serial_gemm_nt_ct_dcomplex_double ) { // typedef ::Test::Gemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_gemm,double,param_tag_type,algo_tag_type>(); +// test_batched_gemm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp index c32556c229..6f074867d9 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm_Real.hpp @@ -18,10 +18,10 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_bhalf_bhalf) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } @@ -29,10 +29,10 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_bhalf_bhalf) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } @@ -40,10 +40,10 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_bhalf_bhalf) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } @@ -51,10 +51,10 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_bhalf_bhalf) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } @@ -65,45 +65,37 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_half_half) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_half_half) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_half_half) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } 
TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_half_half) { typedef ::Test::Gemm::ParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); + test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT @@ -112,29 +104,25 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_float_float) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_float_float) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_float_float) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_float_float) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif @@ -143,28 +131,28 @@ TEST_F(TestCategory, batched_scalar_serial_gemm_nt_nt_double_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_nt_double_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_nt_t_double_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_serial_gemm_t_t_double_double) { typedef ::Test::Gemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_gemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp index 
3b17d81d48..bb05fab3bb 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp @@ -35,6 +35,7 @@ namespace Gesv { template struct Functor_TestBatchedSerialGesv { + using execution_space = typename DeviceType::execution_space; const MatrixType _A; const MatrixType _tmp; const VectorType _X; @@ -61,7 +62,7 @@ struct Functor_TestBatchedSerialGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _X.extent(0)); + Kokkos::RangePolicy policy(0, _X.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialGesv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialGesv_Real.hpp index 2b2493506e..00161ecb70 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGesv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGesv_Real.hpp @@ -15,20 +15,18 @@ //@HEADER #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_float) { - test_batched_gesv(); + test_batched_gesv(); } TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_float) { - test_batched_gesv(); + test_batched_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_double) { - test_batched_gesv(); + test_batched_gesv(); } TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_double) { - test_batched_gesv(); + test_batched_gesv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp index d3cbd6c024..23ded73e25 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp @@ -44,6 +44,7 @@ struct ParamTag { template struct 
Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -72,7 +73,7 @@ struct Functor_BatchedSerialGemm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _c.extent(0)); + Kokkos::RangePolicy policy(0, _c.extent(0)); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -80,6 +81,7 @@ struct Functor_BatchedSerialGemm { template struct Functor_BatchedSerialLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -100,7 +102,7 @@ struct Functor_BatchedSerialLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for((name + "::LUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -109,6 +111,7 @@ struct Functor_BatchedSerialLU { template struct Functor_TestBatchedSerialInverseLU { + using execution_space = typename DeviceType::execution_space; AViewType _a; WViewType _w; @@ -130,7 +133,7 @@ struct Functor_TestBatchedSerialInverseLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for((name + "::InverseLUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp index 0be3375715..243ed21908 100644 --- 
a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Complex.hpp @@ -18,11 +18,11 @@ TEST_F(TestCategory, batched_scalar_serial_inverselu_dcomplex) { // printf("Batched serial inverse LU - double complex - algorithm type: // Unblocked\n"); - test_batched_inverselu, + test_batched_inverselu, Algo::InverseLU::Unblocked>(); // printf("Batched serial inverse LU - double complex - algorithm type: // Blocked\n"); - test_batched_inverselu, + test_batched_inverselu, Algo::InverseLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Real.hpp index 7eea2c9627..143db37b0c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU_Real.hpp @@ -17,17 +17,17 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_inverselu_float) { // printf("Batched serial inverse LU - float - algorithm type: Unblocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); // printf("Batched serial inverse LU - float - algorithm type: Blocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_inverselu_double) { // printf("Batched serial inverse LU - double - algorithm type: Unblocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); // printf("Batched serial inverse LU - double - algorithm type: Blocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialLU.hpp b/batched/dense/unit_test/Test_Batched_SerialLU.hpp index 23b72893b2..87224aa888 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU.hpp @@ -32,6 +32,7 @@ namespace Test { template struct 
Functor_TestBatchedSerialLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -52,7 +53,7 @@ struct Functor_TestBatchedSerialLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialLU_Complex.hpp index 29936b7825..b07bece091 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU_Complex.hpp @@ -17,6 +17,6 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_lu_dcomplex) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu, algo_tag_type>(); + test_batched_lu, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialLU_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialLU_Real.hpp index a185e3b520..ace508fab2 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_lu_float) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu(); + test_batched_lu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_lu_double) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu(); + test_batched_lu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index 5aa832f0df..099fa9219f 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ 
b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -406,13 +406,14 @@ void GenerateTestData(ViewT data) { }); } -template +template void testIssue1786() { - using memory_space = typename ExeSpace::memory_space; + using execution_space = typename Device::execution_space; + using memory_space = typename Device::memory_space; constexpr int num_tests = 4; Kokkos::View matrices("data", num_tests); - GenerateTestData(matrices); + GenerateTestData(matrices); Kokkos::View Us("Us", matrices.extent(0)); Kokkos::View Ss("Ss", matrices.extent(0)); @@ -425,7 +426,7 @@ void testIssue1786() { "matrices_copy", matrices.extent(0)); // make a copy of the input data to avoid overwriting it Kokkos::deep_copy(matrices_copy, matrices); - auto policy = Kokkos::RangePolicy(0, matrices.extent(0)); + auto policy = Kokkos::RangePolicy(0, matrices.extent(0)); Kokkos::parallel_for( "polar decomposition", policy, KOKKOS_LAMBDA(int i) { auto matrix_copy = @@ -455,19 +456,19 @@ void testIssue1786() { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_svd_double) { // Test general SVD on a few different input sizes (full rank randomized) - testSVD(); - testSVD(); - testIssue1786(); - testIssue1786(); + testSVD(); + testSVD(); + testIssue1786(); + testIssue1786(); } #endif #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_svd_float) { // Test general SVD on a few different input sizes (full rank randomized) - testSVD(); - testSVD(); - testIssue1786(); - testIssue1786(); + testSVD(); + testSVD(); + testIssue1786(); + testIssue1786(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp index 48e8e5dead..43cb8fab2f 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp @@ -44,6 +44,7 @@ struct ParamTag { template struct Functor_BatchedSerialGemm { + using execution_space = typename 
DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -72,7 +73,7 @@ struct Functor_BatchedSerialGemm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _c.extent(0)); + Kokkos::RangePolicy policy(0, _c.extent(0)); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -80,6 +81,7 @@ struct Functor_BatchedSerialGemm { template struct Functor_BatchedSerialLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -100,7 +102,7 @@ struct Functor_BatchedSerialLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for((name + "::LUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -109,6 +111,7 @@ struct Functor_BatchedSerialLU { template struct Functor_TestBatchedSerialSolveLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; ViewType _b; @@ -130,7 +133,7 @@ struct Functor_TestBatchedSerialSolveLU { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for((name + "::SolveLUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp index b0977189a4..6eaf9ca5aa 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp +++ 
b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Complex.hpp @@ -18,11 +18,11 @@ TEST_F(TestCategory, batched_scalar_serial_solvelu_dcomplex) { // printf("Batched serial solveLU - double complex - algorithm type: // Unblocked\n"); - test_batched_solvelu, + test_batched_solvelu, Algo::SolveLU::Unblocked>(); // printf("Batched serial solveLU - double complex - algorithm type: // Blocked\n"); - test_batched_solvelu, + test_batched_solvelu, Algo::SolveLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Real.hpp index f586e3b62c..37d768df98 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU_Real.hpp @@ -17,17 +17,17 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_solvelu_float) { // printf("Batched serial solveLU - float - algorithm type: Unblocked\n"); - test_batched_solvelu(); + test_batched_solvelu(); // printf("Batched serial solveLU - float - algorithm type: Blocked\n"); - test_batched_solvelu(); + test_batched_solvelu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_solvelu_double) { // printf("Batched serial solveLU - double - algorithm type: Unblocked\n"); - test_batched_solvelu(); + test_batched_solvelu(); // printf("Batched serial solveLU - double - algorithm type: Blocked\n"); - test_batched_solvelu(); + test_batched_solvelu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp index af38e62e4d..7a7e89ebf8 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp @@ -113,6 +113,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialTrmm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -138,7 +139,7 
@@ struct Functor_TestBatchedSerialTrmm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp index 5718a81694..8ab6e2810c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm_Complex.hpp @@ -22,8 +22,8 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_scomplex_scomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, 
batched_scalar_serial_trmm_r_u_nt_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_scomplex_scomplex) { @@ -77,8 +77,8 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_scomplex_scomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_scomplex_scomplex) { @@ -132,8 +132,8 @@ TEST_F(TestCategory, 
batched_scalar_serial_trmm_l_l_ct_u_scomplex_scomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_scomplex_scomplex) { typedef ::Test::Trmm::ParamTag, - Kokkos::complex, param_tag_type, algo_tag_type>(128); + test_batched_trmm, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } #endif @@ -190,7 +190,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -200,7 +200,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, 
algo_tag_type>( 128); } @@ -210,7 +210,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -220,7 +220,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -230,7 +230,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -240,7 +240,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -251,7 +251,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -261,7 +261,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -271,7 +271,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -281,7 +281,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - 
test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -291,7 +291,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -301,7 +301,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -312,7 +312,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -322,7 +322,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -332,7 +332,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -342,7 +342,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -352,7 +352,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_dcomplex_dcomplex) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -362,7 +362,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_dcomplex_dcomplex) { 
param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm, + test_batched_trmm, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp index c01ae8dbea..1cfc259dd3 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm_Real.hpp @@ -22,8 +22,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_float_float) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } // TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_float_float) { @@ -77,8 +71,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_float_float) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, 
batched_scalar_serial_trmm_r_u_t_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } // CONJUGATE TRANSPOSE TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_float_float) { @@ -132,8 +120,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_float_float) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_float_float) { typedef ::Test::Trmm::ParamTag(); + test_batched_trmm(); } #endif @@ -190,7 +172,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_double_double) { @@ -199,7 +181,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_nt_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_double_double) { @@ -208,7 +190,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_nt_n_double_double) { @@ -217,7 +199,7 @@ TEST_F(TestCategory, 
batched_scalar_serial_trmm_l_u_nt_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_double_double) { @@ -226,7 +208,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_double_double) { @@ -235,7 +217,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_nt_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } // TRANSPOSE @@ -245,7 +227,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_double_double) { @@ -254,7 +236,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_t_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_double_double) { @@ -263,7 +245,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_double_double) { @@ -272,7 +254,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_t_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_double_double) { @@ -281,7 +263,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_t_n_double_double) { @@ -290,7 +272,7 @@ TEST_F(TestCategory, 
batched_scalar_serial_trmm_r_u_t_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } // CONJUGATE TRANSPOSE @@ -300,7 +282,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_double_double) { @@ -309,7 +291,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_l_ct_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_double_double) { @@ -318,7 +300,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_double_double) { @@ -327,7 +309,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_l_u_ct_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_double_double) { @@ -336,7 +318,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_u_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_double_double) { @@ -345,7 +327,7 @@ TEST_F(TestCategory, batched_scalar_serial_trmm_r_u_ct_n_double_double) { param_tag_type; typedef Algo::Trmm::Unblocked algo_tag_type; - test_batched_trmm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp index c0ef098652..f9418a804a 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp @@ -40,6 +40,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialTrsm { + using execution_space = 
typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -65,7 +66,7 @@ struct Functor_TestBatchedSerialTrsm { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::RangePolicy policy(0, _b.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp index d7c52ccd77..be0005a74c 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm_Complex.hpp @@ -20,7 +20,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_dcomplex) { @@ -28,7 +28,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { @@ -36,7 +36,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_nt_n_dcomplex_dcomplex ) @@ -44,14 +44,14 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_dcomplex) { // typedef // ::Test::Trmm::ParamTag // param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; -// 
test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_dcomplex_dcomplex) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_dcomplex) { @@ -59,7 +59,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // @@ -68,7 +68,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_dcomplex) { @@ -76,7 +76,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { @@ -84,7 +84,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, + test_batched_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_dcomplex ) @@ -92,7 +92,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_dcomplex) { // typedef // ::Test::Trmm::ParamTag // param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; -// 
test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_trsm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_double) { @@ -100,46 +100,46 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_dcomplex_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_nt_n_dcomplex_double ) { // typedef // ::Test::Trmm::ParamTag // param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; -// test_batched_trsm,double,param_tag_type,algo_tag_type>(); +// test_batched_trsm,double,param_tag_type,algo_tag_type>(); // } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, 
+ algo_tag_type>(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_double) { @@ -147,29 +147,29 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_dcomplex_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_dcomplex_double) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm, double, - param_tag_type, algo_tag_type>(); + test_batched_trsm, double, param_tag_type, + algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_serial_trsm_l_u_t_n_dcomplex_double ) { // typedef // ::Test::Trmm::ParamTag // param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; -// test_batched_trsm,double,param_tag_type,algo_tag_type>(); +// test_batched_trsm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp index c308071145..18b10a81e6 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm_Real.hpp @@ -20,48 +20,42 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_float_float) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + 
test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } // TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_float_float) { @@ -69,32 +63,28 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_float_float) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_float_float) { typedef ::Test::Trmm::ParamTag param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); + test_batched_trsm(); } #endif @@ -104,7 +94,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, 
batched_scalar_serial_trsm_l_l_nt_n_double_double) { @@ -112,7 +102,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_nt_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_double_double) { @@ -120,7 +110,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_double_double) { @@ -128,7 +118,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_nt_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_double_double) { @@ -136,7 +126,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_double_double) { @@ -144,7 +134,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_r_u_nt_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } // @@ -153,7 +143,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_double_double) { @@ -161,7 +151,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_l_t_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_double_double) { @@ -169,7 +159,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked 
algo_tag_type; - test_batched_trsm(); } TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_double_double) { @@ -177,7 +167,7 @@ TEST_F(TestCategory, batched_scalar_serial_trsm_l_u_t_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_trsm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp index f05a6f7fa5..512dce3bce 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp @@ -39,6 +39,7 @@ struct ParamTag { template struct Functor_TestBatchedSerialTrsv { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -64,7 +65,7 @@ struct Functor_TestBatchedSerialTrsv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::RangePolicy policy(0, _b.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp index 1af71e7104..a524b9f97e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv_Complex.hpp @@ -19,28 +19,28 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_dcomplex_dcomplex) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_dcomplex_dcomplex) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, + test_batched_trsv, Kokkos::complex, param_tag_type, 
algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_dcomplex_dcomplex) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_dcomplex_dcomplex) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, + test_batched_trsv, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -48,28 +48,28 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_dcomplex_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, - param_tag_type, algo_tag_type>(); + test_batched_trsv, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_dcomplex_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, - param_tag_type, algo_tag_type>(); + test_batched_trsv, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_dcomplex_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, - param_tag_type, algo_tag_type>(); + test_batched_trsv, double, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_dcomplex_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv, double, - param_tag_type, algo_tag_type>(); + test_batched_trsv, double, param_tag_type, + algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp index 71eb62b559..be1bf77b9e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp +++ 
b/batched/dense/unit_test/Test_Batched_SerialTrsv_Real.hpp @@ -19,29 +19,25 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_float_float) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_float_float) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_float_float) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_float_float) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); + test_batched_trsv(); } #endif @@ -50,28 +46,28 @@ TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_u_double_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_l_nt_n_double_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_u_double_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); } TEST_F(TestCategory, batched_scalar_serial_trsv_u_nt_n_double_double) { typedef ::Test::Trsv::ParamTag param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; - test_batched_trsv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp index 8f4ae64b7e..b09cadcb7e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp @@ -113,6 +113,7 @@ struct 
ParamTag { template struct Functor_TestBatchedSerialTrtri { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -132,7 +133,7 @@ struct Functor_TestBatchedSerialTrtri { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy policy(0, _a.extent(0)); Kokkos::parallel_for("Functor_TestBatchedSerialTrtri", policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp index db9d06ec06..0d8f2c72a6 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri_Complex.hpp @@ -20,33 +20,29 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_scomplex_scomplex) { typedef ::Test::Trtri::ParamTag 
param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, - Kokkos::complex, param_tag_type, algo_tag_type>( - 128); + test_batched_trtri, Kokkos::complex, + param_tag_type, algo_tag_type>(128); } #endif @@ -56,7 +52,7 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -64,7 +60,7 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -72,7 +68,7 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } @@ -80,7 +76,7 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_dcomplex_dcomplex) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri, + test_batched_trtri, Kokkos::complex, param_tag_type, algo_tag_type>( 128); } diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp index 48617506de..952994d207 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri_Real.hpp @@ -20,29 +20,25 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_float_float) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_float_float) { typedef 
::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_float_float) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_float_float) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); + test_batched_trtri(); } #endif @@ -52,28 +48,28 @@ TEST_F(TestCategory, batched_scalar_serial_trtri_u_n_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_u_u_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_n_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); } TEST_F(TestCategory, batched_scalar_serial_trtri_l_u_double_double) { typedef ::Test::Trtri::ParamTag param_tag_type; typedef Algo::Trtri::Unblocked algo_tag_type; - test_batched_trtri(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp index 7941fc0284..b43b498607 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp @@ -30,6 +30,7 @@ namespace TeamAxpy { template struct Functor_TestBatchedTeamAxpy { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ViewType _X; const ViewType _Y; @@ -65,8 +66,8 @@ struct Functor_TestBatchedTeamAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; 
Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp index 79c25ba9dc..b95b769fcc 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy_Complex.hpp @@ -16,11 +16,11 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_axpy_nt_dcomplex_dcomplex) { - test_batched_team_axpy, + test_batched_team_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_team_axpy_nt_dcomplex_double) { - test_batched_team_axpy, double>(); + test_batched_team_axpy, double>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp index 967bfa8e46..ac458d4a55 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_axpy_nt_float_float) { - test_batched_team_axpy(); + test_batched_team_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_axpy_nt_double_double) { - test_batched_team_axpy(); + test_batched_team_axpy(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp index 9023a009af..2d952889c9 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp @@ -41,6 +41,7 @@ struct ParamTag { template struct Functor_TestBatchedTeamGemm { + using execution_space = typename 
DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -73,8 +74,8 @@ struct Functor_TestBatchedTeamGemm { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp index ebe22e6e1d..09c7f3f2cc 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm_Complex.hpp @@ -22,7 +22,7 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_dcomplex) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -30,7 +30,7 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_dcomplex) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -38,7 +38,7 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_dcomplex) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -46,19 +46,19 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_dcomplex) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, + test_batched_teamgemm, Kokkos::complex, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_dcomplex ) { // typedef 
::Test::TeamGemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_dcomplex ) { // typedef ::Test::TeamGemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -67,39 +67,39 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_dcomplex_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_dcomplex_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_dcomplex_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_dcomplex_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm, double, + test_batched_teamgemm, double, param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemm_ct_nt_dcomplex_double ) { // typedef ::Test::TeamGemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } // 
TEST_F( TestCategory, batched_scalar_team_gemm_nt_ct_dcomplex_double ) { // typedef ::Test::TeamGemm::ParamTag // param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; -// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); +// test_batched_teamgemm,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp index f109fa4bf9..b1a5135018 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm_Real.hpp @@ -18,10 +18,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_bhalf_bhalf) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -29,10 +29,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_bhalf_bhalf) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -40,10 +40,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_bhalf_bhalf) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -51,10 +51,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_t_bhalf_bhalf) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -65,10 +65,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_half_half) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -76,10 +76,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_half_half) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -87,10 +87,10 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_half_half) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -98,10 +98,10 @@ TEST_F(TestCategory, 
batched_scalar_team_gemm_t_t_half_half) { typedef ::Test::TeamGemm::ParamTag param_tag_type; - test_batched_teamgemm(); - test_batched_teamgemm(); } @@ -112,28 +112,28 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_float_float) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_float_float) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_float_float) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_float_float) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } #endif @@ -143,28 +143,28 @@ TEST_F(TestCategory, batched_scalar_team_gemm_nt_nt_double_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_nt_double_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_nt_t_double_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } TEST_F(TestCategory, batched_scalar_team_gemm_t_t_double_double) { typedef ::Test::TeamGemm::ParamTag param_tag_type; typedef Algo::Gemm::Blocked algo_tag_type; - test_batched_teamgemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp index 89f67e2731..dc3b4e53fb 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp @@ -35,6 
+35,7 @@ namespace TeamGesv { template struct Functor_TestBatchedTeamGesv { + using execution_space = typename DeviceType::execution_space; const MatrixType _A; const VectorType _X; const VectorType _B; @@ -62,8 +63,8 @@ struct Functor_TestBatchedTeamGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); using MatrixViewType = Kokkos::View(); } TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) { - test_batched_team_gesv(); + test_batched_team_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_double) { - test_batched_team_gesv(); } TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_double) { - test_batched_team_gesv(); + test_batched_team_gesv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp index 8657de9856..a62e655d02 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp @@ -44,6 +44,7 @@ struct ParamTag { template struct Functor_BatchedTeamGemm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -82,8 +83,8 @@ struct Functor_BatchedTeamGemm { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp index 
97afe2c1ad..7eb918beef 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Complex.hpp @@ -18,11 +18,11 @@ TEST_F(TestCategory, batched_scalar_team_inverselu_dcomplex) { // printf("Batched team inverse LU - double complex - algorithm type: // Unblocked\n"); - test_batched_inverselu, + test_batched_inverselu, Algo::InverseLU::Unblocked>(); // printf("Batched team inverse LU - double complex - algorithm type: // Blocked\n"); - test_batched_inverselu, + test_batched_inverselu, Algo::InverseLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Real.hpp index 74c7efd25b..3939fdd13a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU_Real.hpp @@ -17,17 +17,17 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_inverselu_float) { // printf("Batched team inverse LU - float - algorithm type: Unblocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); // printf("Batched team inverse LU - float - algorithm type: Blocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_inverselu_double) { // printf("Batched team inverse LU - double - algorithm type: Unblocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); // printf("Batched team inverse LU - double - algorithm type: Blocked\n"); - test_batched_inverselu(); + test_batched_inverselu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamLU.hpp b/batched/dense/unit_test/Test_Batched_TeamLU.hpp index 04e191b9cb..e20f3a7411 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU.hpp @@ -34,6 +34,8 @@ namespace TeamLU { template struct Functor_TestBatchedTeamLU { 
+ using execution_space = typename DeviceType::execution_space; + ViewType _a; KOKKOS_INLINE_FUNCTION @@ -60,7 +62,7 @@ struct Functor_TestBatchedTeamLU { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamLU_Complex.hpp index e05b521f8c..2c422397e7 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU_Complex.hpp @@ -17,6 +17,6 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_team_lu_dcomplex) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu, algo_tag_type>(); + test_batched_lu, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamLU_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamLU_Real.hpp index 5e7f05277e..5babaf996c 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_lu_float) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu(); + test_batched_lu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_lu_double) { typedef Algo::LU::Blocked algo_tag_type; - test_batched_lu(); + test_batched_lu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp index 41287f9b52..445e10132f 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp @@ -44,6 +44,7 @@ struct ParamTag { template struct Functor_BatchedTeamGemm { + using 
execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -81,14 +82,15 @@ struct Functor_BatchedTeamGemm { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for((name + "::GemmFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } }; template struct Functor_BatchedTeamLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; KOKKOS_INLINE_FUNCTION @@ -113,7 +115,7 @@ struct Functor_BatchedTeamLU { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for((name + "::LUFunctor").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } @@ -121,6 +123,7 @@ struct Functor_BatchedTeamLU { template struct Functor_TestBatchedTeamSolveLU { + using execution_space = typename DeviceType::execution_space; ViewType _a; ViewType _b; @@ -146,7 +149,7 @@ struct Functor_TestBatchedTeamSolveLU { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for((name + "::SolveLU").c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp index 4076fd5c31..865f58ef43 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Complex.hpp @@ -18,11 +18,11 @@ TEST_F(TestCategory, batched_scalar_team_solvelu_dcomplex) { // 
printf("Batched team solveLU - double complex - algorithm type: // Unblocked\n"); - test_batched_team_solvelu, + test_batched_team_solvelu, Algo::SolveLU::Unblocked>(); // printf("Batched team solveLU - double complex - algorithm type: // Blocked\n"); - test_batched_team_solvelu, + test_batched_team_solvelu, Algo::SolveLU::Blocked>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Real.hpp index 4882caabe8..73c55e8a93 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU_Real.hpp @@ -17,17 +17,17 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_solvelu_float) { // printf("Batched team solveLU - float - algorithm type: Unblocked\n"); - test_batched_team_solvelu(); + test_batched_team_solvelu(); // printf("Batched team solveLU - float - algorithm type: Blocked\n"); - test_batched_team_solvelu(); + test_batched_team_solvelu(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_solvelu_double) { // printf("Batched team solveLU - double - algorithm type: Unblocked\n"); - test_batched_team_solvelu(); + test_batched_team_solvelu(); // printf("Batched team solveLU - double - algorithm type: Blocked\n"); - test_batched_team_solvelu(); + test_batched_team_solvelu(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp index 2f7781745d..523bd02df4 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp @@ -43,6 +43,7 @@ struct ParamTag { template struct Functor_TestBatchedTeamTrsm { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -74,8 +75,8 @@ struct Functor_TestBatchedTeamTrsm { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _b.extent(0); - 
Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp index 7648017287..0cf2761922 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm_Complex.hpp @@ -20,7 +20,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_dcomplex) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -29,7 +29,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_dcomplex) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -38,7 +38,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_dcomplex) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -47,7 +47,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_dcomplex) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -56,7 +56,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_dcomplex) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -65,7 +65,7 @@ TEST_F(TestCategory, 
batched_scalar_team_trsm_r_u_nt_n_dcomplex_dcomplex) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -75,7 +75,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -84,7 +84,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -93,7 +93,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_dcomplex) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -102,7 +102,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_dcomplex) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, + test_batched_team_trsm, Kokkos::complex, param_tag_type, algo_tag_type>(); } @@ -112,7 +112,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_dcomplex_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_double) { @@ -120,7 +120,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_dcomplex_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } 
TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_double) { @@ -128,7 +128,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_dcomplex_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_double) { @@ -136,7 +136,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_dcomplex_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_double) { @@ -144,7 +144,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_dcomplex_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_double) { @@ -152,7 +152,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_dcomplex_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } // @@ -161,7 +161,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_dcomplex_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_double) { @@ -169,7 +169,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_dcomplex_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, 
double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_double) { @@ -177,7 +177,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_dcomplex_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_double) { @@ -185,7 +185,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_dcomplex_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm, double, + test_batched_team_trsm, double, param_tag_type, algo_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp index d705b42a50..6757617ddd 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm_Real.hpp @@ -20,7 +20,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_float_float) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_float_float) { @@ -28,7 +28,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_float_float) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_float_float) { @@ -36,7 +36,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_float_float) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_float_float) { @@ -44,7 +44,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_float_float) { 
Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_float_float) { @@ -52,7 +52,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_float_float) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_float_float) { @@ -60,7 +60,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_float_float) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } // @@ -69,7 +69,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_float_float) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_float_float) { @@ -77,7 +77,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_float_float) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_float_float) { @@ -85,7 +85,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_float_float) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_float_float) { @@ -93,7 +93,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_float_float) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } #endif @@ -104,7 +104,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_u_double_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_nt_n_double_double) { @@ -112,7 +112,7 @@ TEST_F(TestCategory, 
batched_scalar_team_trsm_l_l_nt_n_double_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_double_double) { @@ -120,7 +120,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_u_double_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_double_double) { @@ -128,7 +128,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_nt_n_double_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_double_double) { @@ -136,7 +136,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_u_double_double) { Trans::NoTranspose, Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_double_double) { @@ -144,7 +144,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_r_u_nt_n_double_double) { Trans::NoTranspose, Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } // @@ -153,7 +153,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_double_double) { @@ -161,7 +161,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_l_t_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_double_double) { @@ -169,7 +169,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_u_double_double) { Diag::Unit> param_tag_type; typedef Algo::Trsm::Blocked 
algo_tag_type; - test_batched_team_trsm(); } TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_double_double) { @@ -177,7 +177,7 @@ TEST_F(TestCategory, batched_scalar_team_trsm_l_u_t_n_double_double) { Diag::NonUnit> param_tag_type; typedef Algo::Trsm::Blocked algo_tag_type; - test_batched_team_trsm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp index bb00b78736..400e35deb8 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp @@ -41,6 +41,7 @@ struct ParamTag { template struct Functor_TestBatchedTeamTrsv { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b; ScalarType _alpha; @@ -72,8 +73,8 @@ struct Functor_TestBatchedTeamTrsv { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _b.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv_Complex.hpp index d01e4b7f94..304e929462 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv_Complex.hpp @@ -19,49 +19,49 @@ // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_n_dcomplex_dcomplex ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// 
test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_u_dcomplex_dcomplex ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_n_dcomplex_dcomplex ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_u_dcomplex_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_n_dcomplex_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_u_dcomplex_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_n_dcomplex_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); +// 
test_batched_team_trsv,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv_Real.hpp index d270a5f4f9..532ed87f4f 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv_Real.hpp @@ -19,25 +19,25 @@ // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_n_float_float ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_u_float_float ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_n_float_float ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } #endif @@ -46,24 +46,24 @@ // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_l_nt_n_double_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, batched_scalar_team_trsv_u_nt_u_double_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } // TEST_F( TestCategory, 
batched_scalar_team_trsv_u_nt_n_double_double ) { // typedef // ::Test::TeamTrsv::ParamTag // param_tag_type; typedef Algo::Trsv::Blocked algo_tag_type; -// test_batched_team_trsv(); +// test_batched_team_trsv(); // } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp index 5ea8a80717..fca0534b4b 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp @@ -30,6 +30,7 @@ namespace TeamVectorAxpy { template struct Functor_TestBatchedTeamVectorAxpy { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ViewType _X; const ViewType _Y; @@ -66,8 +67,8 @@ struct Functor_TestBatchedTeamVectorAxpy { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp index 161db2d3f5..b1f70a723e 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Complex.hpp @@ -16,12 +16,11 @@ #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_dcomplex) { - test_batched_teamvector_axpy, + test_batched_teamvector_axpy, Kokkos::complex>(); } TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_dcomplex_double) { - test_batched_teamvector_axpy, - double>(); + test_batched_teamvector_axpy, double>(); } #endif diff --git 
a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Real.hpp index f81f17046f..15570bc094 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_float_float) { - test_batched_teamvector_axpy(); + test_batched_teamvector_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_axpy_nt_double_double) { - test_batched_teamvector_axpy(); + test_batched_teamvector_axpy(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition.hpp index 69cab9c63c..bf907feb96 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition.hpp @@ -74,7 +74,7 @@ name_value_type = ( std::is_same::value ? "::Float" : "::ComplexFloat" : std::is_same >::value ? 
"::ComplexDouble" : "::UnknownValueType" ); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion( name.c_str() ); - Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO); + Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition_Real.hpp index 98ae616bbc..0a71de6bb7 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorEigendecomposition_Real.hpp @@ -16,13 +16,13 @@ /* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_float ) { - test_batched_teamvector_eigendecomposition(); + test_batched_teamvector_eigendecomposition(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_double ) { - test_batched_teamvector_eigendecomposition(); + test_batched_teamvector_eigendecomposition(); } #endif */ diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp index 327f28353e..f2f3bc217d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp @@ -36,6 +36,7 @@ struct ParamTag { template struct Functor_TestBatchedTeamVector { + using execution_space = typename DeviceType::execution_space; ViewType _a, _b, _c; ScalarType _alpha, _beta; @@ -68,8 +69,8 @@ struct Functor_TestBatchedTeamVector { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), 
policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp index 9250896194..cc6cbdd511 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Complex.hpp @@ -19,8 +19,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_scomplex_scomplex) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -28,8 +28,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_scomplex_scomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -37,8 +37,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_scomplex_scomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -46,8 +46,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_scomplex_scomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // 
test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -59,8 +59,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_dcomplex_dcomplex) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -68,8 +68,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_dcomplex_dcomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -77,8 +77,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_dcomplex_dcomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } @@ -86,8 +86,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_dcomplex_dcomplex) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); - test_batched_teamvectorgemm, + // test_batched_teamvectorgemm,Kokkos::complex,param_tag_type,Algo::Gemm::Blocked>(); + test_batched_teamvectorgemm, Kokkos::complex, param_tag_type, Algo::Gemm::Unblocked>(); } diff --git 
a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp index b8ad094f8e..e96bc1ac5c 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGemm_Real.hpp @@ -19,8 +19,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_bhalf_bhalf) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -28,8 +28,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_bhalf_bhalf) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -37,8 +37,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_bhalf_bhalf) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -46,8 +46,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_bhalf_bhalf) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -59,8 +59,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_half_half) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -68,8 +68,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_half_half) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -77,8 +77,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_half_half) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + 
test_batched_teamvectorgemm(); } @@ -86,8 +86,8 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_half_half) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } @@ -99,32 +99,32 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_float_float) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_float_float) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_float_float) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_t_float_float) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } #endif @@ -135,32 +135,32 @@ TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_nt_double_double) { Trans::NoTranspose> param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_t_nt_double_double) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, batched_scalar_team_vector_gemm_nt_t_double_double) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } TEST_F(TestCategory, 
batched_scalar_team_vector_gemm_t_t_double_double) { typedef ::Test::TeamVectorGemm::ParamTag param_tag_type; - // test_batched_teamvectorgemm(); - test_batched_teamvectorgemm(); + test_batched_teamvectorgemm(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp index 2026f2f81d..ddb1a5c40d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp @@ -35,6 +35,7 @@ namespace TeamVectorGesv { template struct Functor_TestBatchedTeamVectorGesv { + using execution_space = typename DeviceType::execution_space; const MatrixType _A; const VectorType _X; const VectorType _B; @@ -63,8 +64,8 @@ struct Functor_TestBatchedTeamVectorGesv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), - Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); using MatrixViewType = Kokkos::View(); } TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_float) { - test_batched_teamvector_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_double) { - test_batched_teamvector_gesv(); } TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_double) { - test_batched_teamvector_gesv(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp index 58d305f494..84ccb39611 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp @@ -35,6 +35,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorQR { + using execution_space = typename DeviceType::execution_space; MatrixViewType _a; VectorViewType _x, _b, 
_t; WorkViewType _w; @@ -99,7 +100,7 @@ struct Functor_TestBatchedTeamVectorQR { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_Real.hpp index 54c0388d17..d79d868bc1 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_Real.hpp @@ -17,7 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_qr_float) { typedef Algo::QR::Unblocked algo_tag_type; - test_batched_qr(); + test_batched_qr(); } #endif @@ -26,7 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_qr_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_qr_double) { typedef Algo::QR::Unblocked algo_tag_type; - test_batched_qr(); + test_batched_qr(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index c86d4e86a8..09427aa25e 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -35,6 +35,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { + using execution_space = typename DeviceType::execution_space; MatrixViewType _a; VectorViewType _x, _b, _t; PivotViewType _p; @@ -108,7 +109,7 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, 
Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp index 81e010a895..35713ac7f1 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting_Real.hpp @@ -17,8 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_qr_with_columnpivoting_float) { typedef Algo::QR::Unblocked algo_tag_type; - test_batched_qr_with_columnpivoting(); + test_batched_qr_with_columnpivoting(); } #endif @@ -27,8 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_qr_with_columnpivoting_float) { #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_qr_with_columnpivoting_double) { typedef Algo::QR::Unblocked algo_tag_type; - test_batched_qr_with_columnpivoting(); + test_batched_qr_with_columnpivoting(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp index 29496c1b87..2f30c7d3c1 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp @@ -35,6 +35,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorSolveUTV { + using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; PivViewType _p; VectorViewType _x, _b; @@ -121,7 +122,7 @@ struct Functor_TestBatchedTeamVectorSolveUTV { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git 
a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp index 45d6093f2a..cf7084a92c 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp @@ -35,6 +35,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorSolveUTV2 { + using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; PivViewType _p; VectorViewType _x, _b; @@ -125,7 +126,7 @@ struct Functor_TestBatchedTeamVectorSolveUTV2 { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2_Real.hpp index 52b8f263c3..c8e547d1d0 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2_Real.hpp @@ -17,7 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_solve_utv2_float) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_solve_utv2(); + test_batched_solve_utv2(); } #endif @@ -26,7 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_solve_utv2_float) { #ifndef KOKKOS_ENABLE_SYCL TEST_F(TestCategory, batched_scalar_teamvector_solve_utv2_double) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_solve_utv2(); + test_batched_solve_utv2(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV_Real.hpp index b2bc52dafb..a3b5bcec29 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV_Real.hpp +++ 
b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV_Real.hpp @@ -17,7 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_solve_utv_float) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_solve_utv(); + test_batched_solve_utv(); } #endif @@ -26,7 +26,7 @@ TEST_F(TestCategory, batched_scalar_teamvector_solve_utv_float) { #ifndef KOKKOS_ENABLE_SYCL TEST_F(TestCategory, batched_scalar_teamvector_solve_utv_double) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_solve_utv(); + test_batched_solve_utv(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp index 527c93e059..eb45a70c89 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp @@ -34,6 +34,7 @@ namespace Test { template struct Functor_TestBatchedTeamVectorUTV { + using execution_space = typename DeviceType::execution_space; MatrixViewType _r, _a, _acopy, _u, _v; PivViewType _p; VectorViewType _x, _b; @@ -155,7 +156,7 @@ struct Functor_TestBatchedTeamVectorUTV { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorUTV_Real.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorUTV_Real.hpp index 980f0ebf75..7e9a8feafe 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorUTV_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorUTV_Real.hpp @@ -17,7 +17,7 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_utv_float) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_utv(); + test_batched_utv(); } #endif @@ -26,7 +26,7 @@ TEST_F(TestCategory, 
batched_scalar_teamvector_utv_float) { #ifndef KOKKOS_ENABLE_SYCL TEST_F(TestCategory, batched_scalar_teamvector_utv_double) { typedef Algo::UTV::Unblocked algo_tag_type; - test_batched_utv(); + test_batched_utv(); } #endif #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp index 1006325f94..9d1205717f 100644 --- a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp @@ -265,96 +265,96 @@ int test_batched_complex_real_imag_value() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_arithmatic_simd_float3) { - test_batched_vector_arithmatic, 3>(); + test_batched_vector_arithmatic, 3>(); } TEST_F(TestCategory, batched_vector_arithmatic_simd_float4) { - test_batched_vector_arithmatic, 4>(); + test_batched_vector_arithmatic, 4>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_float8) { - test_batched_vector_arithmatic, 8>(); + test_batched_vector_arithmatic, 8>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_float16) { - test_batched_vector_arithmatic, 16>(); + test_batched_vector_arithmatic, 16>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_arithmatic_simd_double3) { - test_batched_vector_arithmatic, 3>(); + test_batched_vector_arithmatic, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_double4) { - test_batched_vector_arithmatic, 4>(); + test_batched_vector_arithmatic, 4>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_double8) { - test_batched_vector_arithmatic, 8>(); + test_batched_vector_arithmatic, 8>(); } #endif #define __DO_NOT_TEST__ #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex3) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 3>(); } // avx TEST_F(TestCategory, 
batched_vector_arithmatic_simd_scomplex4) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 4>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_scomplex8) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 8>(); } TEST_F(TestCategory, batched_vector_scomplex_real_imag_value3) { - test_batched_complex_real_imag_value >, 3>(); } // avx TEST_F(TestCategory, batched_vector_scomplex_real_imag_value2) { - test_batched_complex_real_imag_value >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_scomplex_real_imag_value4) { - test_batched_complex_real_imag_value >, 4>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex3) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 3>(); } // avx TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex2) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_arithmatic_simd_dcomplex4) { - test_batched_vector_arithmatic >, + test_batched_vector_arithmatic >, 4>(); } TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value3) { - test_batched_complex_real_imag_value >, 3>(); } // avx TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value2) { - test_batched_complex_real_imag_value >, 2>(); } // avx 512 TEST_F(TestCategory, batched_vector_dcomplex_real_imag_value4) { - test_batched_complex_real_imag_value >, 4>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp index 9393afd77b..5ab10bb5bd 100644 --- a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp @@ -114,37 +114,37 @@ int test_batched_vector_logical() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_logical_simd_float3) { - test_batched_vector_logical(); + 
test_batched_vector_logical(); } TEST_F(TestCategory, batched_vector_logical_simd_float8) { - test_batched_vector_logical(); + test_batched_vector_logical(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_logical_simd_double3) { - test_batched_vector_logical(); + test_batched_vector_logical(); } TEST_F(TestCategory, batched_vector_logical_simd_double4) { - test_batched_vector_logical(); + test_batched_vector_logical(); } #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // TEST_F( TestCategory, batched_vector_logical_simd_scomplex3 ) { -// test_batched_vector_logical,3>(); +// test_batched_vector_logical,3>(); // } // TEST_F( TestCategory, batched_vector_logical_simd_scomplex4 ) { -// test_batched_vector_logical,4>(); +// test_batched_vector_logical,4>(); // } // #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // TEST_F( TestCategory, batched_vector_logical_simd_dcomplex3 ) { -// test_batched_vector_logical,3>(); +// test_batched_vector_logical,3>(); // } // TEST_F( TestCategory, batched_vector_logical_simd_dcomplex2 ) { -// test_batched_vector_logical,2>(); +// test_batched_vector_logical,2>(); // } // #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorMath.hpp b/batched/dense/unit_test/Test_Batched_VectorMath.hpp index d2aa9eb7bc..02c943d587 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMath.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMath.hpp @@ -157,19 +157,19 @@ int test_batched_vector_math() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_math_simd_float3) { - test_batched_vector_math, 3>(); + test_batched_vector_math, 3>(); } TEST_F(TestCategory, batched_vector_math_simd_float8) { - test_batched_vector_math, 8>(); + test_batched_vector_math, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_math_simd_double3) { - test_batched_vector_math, 3>(); + test_batched_vector_math, 3>(); } TEST_F(TestCategory, 
batched_vector_math_simd_double4) { - test_batched_vector_math, 4>(); + test_batched_vector_math, 4>(); } #endif @@ -178,20 +178,20 @@ TEST_F(TestCategory, batched_vector_math_simd_double4) { // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // TEST_F( TestCategory, batched_vector_math_simd_scomplex3 ) { // test_complex_pow(); -// test_batched_vector_math >,3>(); +// test_batched_vector_math >,3>(); // } // TEST_F( TestCategory, batched_vector_math_simd_scomplex4 ) { -// test_batched_vector_math >,4>(); +// test_batched_vector_math >,4>(); // } // #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // TEST_F( TestCategory, batched_vector_math_simd_dcomplex3 ) { // test_complex_pow(); -// test_batched_vector_math >,3>(); +// test_batched_vector_math >,3>(); // } // TEST_F( TestCategory, batched_vector_math_simd_dcomplex2 ) { -// test_batched_vector_math >,2>(); +// test_batched_vector_math >,2>(); // } // #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp index 70d0e10cd2..5f176ccba8 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp @@ -173,37 +173,37 @@ int test_batched_vector_misc() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_misc_simd_float3) { - test_batched_vector_misc, 3>(); + test_batched_vector_misc, 3>(); } TEST_F(TestCategory, batched_vector_misc_simd_float8) { - test_batched_vector_misc, 8>(); + test_batched_vector_misc, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_misc_simd_double3) { - test_batched_vector_misc, 3>(); + test_batched_vector_misc, 3>(); } TEST_F(TestCategory, batched_vector_misc_simd_double4) { - test_batched_vector_misc, 4>(); + test_batched_vector_misc, 4>(); } #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // TEST_F( TestCategory, batched_vector_misc_simd_scomplex3 ) { -// test_batched_vector_misc 
>,3>(); +// test_batched_vector_misc >,3>(); // } // TEST_F( TestCategory, batched_vector_misc_simd_scomplex4 ) { -// test_batched_vector_misc >,4>(); +// test_batched_vector_misc >,4>(); // } // #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // TEST_F( TestCategory, batched_vector_misc_simd_dcomplex3 ) { -// test_batched_vector_misc >,3>(); +// test_batched_vector_misc >,3>(); // } // TEST_F( TestCategory, batched_vector_misc_simd_dcomplex2 ) { -// test_batched_vector_misc >,2>(); +// test_batched_vector_misc >,2>(); // } // #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp index 54eb2938e5..1aff1b2d0f 100644 --- a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp @@ -127,19 +127,19 @@ int test_batched_vector_relation() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_relation_simd_float3) { - test_batched_vector_relation, 3>(); + test_batched_vector_relation, 3>(); } TEST_F(TestCategory, batched_vector_relation_simd_float8) { - test_batched_vector_relation, 8>(); + test_batched_vector_relation, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_relation_simd_double3) { - test_batched_vector_relation, 3>(); + test_batched_vector_relation, 3>(); } TEST_F(TestCategory, batched_vector_relation_simd_double4) { - test_batched_vector_relation, 4>(); + test_batched_vector_relation, 4>(); } #endif @@ -147,14 +147,14 @@ TEST_F(TestCategory, batched_vector_relation_simd_double4) { // #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) // TEST_F( TestCategory, batched_vector_relation_simd_scomplex4 ) { -// test_batched_vector_relation +// test_batched_vector_relation // >,4>(); // } // #endif // #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) // TEST_F( TestCategory, batched_vector_relation_simd_dcomplex2 ) { -// test_batched_vector_relation +// 
test_batched_vector_relation // >,2>(); // } // #endif diff --git a/batched/dense/unit_test/Test_Batched_VectorView.hpp b/batched/dense/unit_test/Test_Batched_VectorView.hpp index 793c4ac3f3..74c7748cba 100644 --- a/batched/dense/unit_test/Test_Batched_VectorView.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorView.hpp @@ -356,31 +356,31 @@ int test_batched_vector_view() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_vector_view_simd_float8) { - test_batched_vector_view, 8>(); + test_batched_vector_view, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_vector_view_simd_double4) { - test_batched_vector_view, 4>(); + test_batched_vector_view, 4>(); } TEST_F(TestCategory, batched_vector_view_simd_double8) { - test_batched_vector_view, 8>(); + test_batched_vector_view, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, batched_vector_view_simd_scomplex4) { - test_batched_vector_view >, 4>(); + test_batched_vector_view >, 4>(); } TEST_F(TestCategory, batched_vector_view_simd_scomplex8) { - test_batched_vector_view >, 8>(); + test_batched_vector_view >, 8>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, batched_vector_view_simd_dcomplex2) { - test_batched_vector_view >, 2>(); + test_batched_vector_view >, 2>(); } #if defined(KOKKOS_COMPILER_INTEL) && \ @@ -392,7 +392,7 @@ TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { } #else TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { - test_batched_vector_view >, 4>(); + test_batched_vector_view >, 4>(); } #endif // KOKKOS_COMPILER_INTEL #endif // KOKKOSKERNELS_INST_COMPLEX_DOUBLE diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index b7527d923c..b96dc79a80 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -153,49 +153,95 @@ struct 
SerialSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and alpha do not match: " + "X: %d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } if (X.extent(0) != beta.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and beta do not match: X: " + "%d x %d, beta: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + 
(int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif @@ -243,35 +289,67 @@ struct SerialSpmv { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + 
"KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 2b62be1e5a..d7379777be 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -341,49 +341,95 @@ struct TeamVectorSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and alpha do not match: " + "X: %d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } if (X.extent(0) != beta.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and beta do not match: X: " + "%d x %d, beta: %d\n", + (int)X.extent(0), (int)X.extent(1), 
(int)beta.extent(0)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif @@ -438,35 +484,67 @@ struct TeamVectorSpmv { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + 
"KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index c46ef7edc7..beb53521f0 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -192,49 +192,95 @@ struct TeamSpmv { // Check compatibility of dimensions at run time. if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != alpha.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and alpha do not match: " "X: %d x %d, alpha: %d\n", (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and alpha do not match: " + "X: %d x %d, alpha: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)alpha.extent(0)); +#endif return 1; } if (X.extent(0) != beta.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and beta do not match: X: " "%d x %d, beta: %d\n", (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and beta do not match: X: " + "%d x %d, beta: %d\n", + (int)X.extent(0), (int)X.extent(1), (int)beta.extent(0)); +#endif return 1; } 
if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif @@ -289,35 +335,67 @@ struct TeamSpmv { // Check compatibility of dimensions at run time. 
if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " "%d, Y: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), (int)Y.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimensions of X and Y do not match: X: %d x " + "%d, Y: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)Y.extent(0), + (int)Y.extent(1)); +#endif return 1; } if (X.extent(0) != values.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: First dimension of X and the first dimension " "of values do not match: X: %d x %d, values: %d x %d\n", (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: First dimension of X and the first dimension " + "of values do not match: X: %d x %d, values: %d x %d\n", + (int)X.extent(0), (int)X.extent(1), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (colIndices.extent(0) != values.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of colIndices and the second " "dimension of values do not match: colIndices: %d , values: %d x " "%d\n", (int)colIndices.extent(0), (int)values.extent(0), (int)values.extent(1)); +#else + Kokkos::printf( + "KokkosBatched::spmv: Dimension of colIndices and the second " + "dimension of values do not match: colIndices: %d , values: %d x " + "%d\n", + (int)colIndices.extent(0), (int)values.extent(0), + (int)values.extent(1)); +#endif return 1; } if (row_ptr.extent(0) - 1 != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::spmv: Dimension of row_ptr and the second dimension " "of X do not match: colIndices (-1): %d , values: %d x %d\n", (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#else + Kokkos::printf( + 
"KokkosBatched::spmv: Dimension of row_ptr and the second dimension " + "of X do not match: colIndices (-1): %d , values: %d x %d\n", + (int)row_ptr.extent(0) - 1, (int)X.extent(0), (int)X.extent(1)); +#endif return 1; } #endif diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index 728bb2d921..44a982525d 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -109,10 +109,17 @@ class JacobiPrec { } if (tooSmall > 0) +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " "magnitude and have been replaced by one, \n", (int)tooSmall); +#else + Kokkos::printf( + "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " + "magnitude and have been replaced by one, \n", + (int)tooSmall); +#endif computed_inverse = true; } @@ -131,10 +138,17 @@ class JacobiPrec { } if (tooSmall > 0) +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " "magnitude and have been replaced by one, \n", (int)tooSmall); +#else + Kokkos::printf( + "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " + "magnitude and have been replaced by one, \n", + (int)tooSmall); +#endif computed_inverse = true; } @@ -168,4 +182,4 @@ class JacobiPrec { } // namespace KokkosBatched -#endif \ No newline at end of file +#endif diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp index 45b6a71f99..e28efb9b82 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp @@ -32,6 +32,7 @@ namespace GMRES { template struct Functor_TestBatchedSerialGMRES { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -85,7 +86,7 @@ struct 
Functor_TestBatchedSerialGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _D.extent(0) / _N_team); + Kokkos::RangePolicy policy(0, _D.extent(0) / _N_team); const int N = _D.extent(0); const int n = _X.extent(1); diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp index 6201e29ebc..ccfe3c37d5 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_GMRES_float) { - test_batched_serial_GMRES(); + test_batched_serial_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_GMRES_double) { - test_batched_serial_GMRES(); + test_batched_serial_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp index 338a93d0eb..05f2724c5b 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp @@ -41,6 +41,7 @@ template struct Functor_TestBatchedSerialSpmv { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ValuesViewType _D; const IntView _r; @@ -75,7 +76,7 @@ struct Functor_TestBatchedSerialSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _D.extent(0)); + Kokkos::RangePolicy policy(0, _D.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/batched/sparse/unit_test/Test_Batched_SerialSpmv_Real.hpp 
b/batched/sparse/unit_test/Test_Batched_SerialSpmv_Real.hpp index bba455fef7..06c8c2695d 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialSpmv_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialSpmv_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_serial_spmv_nt_float_float) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_spmv(); + test_batched_spmv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_spmv_nt_double_double) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_spmv(); + test_batched_spmv(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp index 41fa682bdd..b05f3db61f 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp @@ -31,6 +31,7 @@ namespace TeamCG { template struct Functor_TestBatchedTeamCG { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -79,8 +80,8 @@ struct Functor_TestBatchedTeamCG { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp index beb2a078e7..1bdb6bc95a 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, 
batched_scalar_team_CG_float) { - test_batched_team_CG(); + test_batched_team_CG(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_CG_double) { - test_batched_team_CG(); + test_batched_team_CG(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp index 2b7ab73790..de1a7f4fc2 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp @@ -32,6 +32,7 @@ namespace TeamGMRES { template struct Functor_TestBatchedTeamGMRES { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -91,8 +92,8 @@ struct Functor_TestBatchedTeamGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); const int N = _D.extent(0); const int n = _X.extent(1); diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp index f40452b952..f8aab13eec 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_GMRES_float) { - test_batched_team_GMRES(); + test_batched_team_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_GMRES_double) { - test_batched_team_GMRES(); + test_batched_team_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp index 5c077f75ed..a6c9ac7ea8 100644 --- 
a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp @@ -42,6 +42,7 @@ template struct Functor_TestBatchedTeamSpmv { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ValuesViewType _D; const IntView _r; @@ -99,7 +100,7 @@ struct Functor_TestBatchedTeamSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy( + Kokkos::TeamPolicy policy( _D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); diff --git a/batched/sparse/unit_test/Test_Batched_TeamSpmv_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamSpmv_Real.hpp index de3f6168a9..d815ee7b12 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamSpmv_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamSpmv_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_team_spmv_nt_float_float) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_team_spmv(); + test_batched_team_spmv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_team_spmv_nt_double_double) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_team_spmv(); + test_batched_team_spmv(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp index abadf27953..3ffd68209b 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp @@ -31,6 +31,7 @@ namespace TeamVectorCG { template struct Functor_TestBatchedTeamVectorCG { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -81,8 +82,8 @@ struct 
Functor_TestBatchedTeamVectorCG { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp index e3b34ca594..859a1a885c 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_CG_float) { - test_batched_teamvector_CG(); + test_batched_teamvector_CG(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_CG_double) { - test_batched_teamvector_CG(); + test_batched_teamvector_CG(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp index f4f208a829..084b623aa2 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp @@ -32,6 +32,7 @@ namespace TeamVectorGMRES { template struct Functor_TestBatchedTeamVectorGMRES { + using execution_space = typename DeviceType::execution_space; const ValuesViewType _D; const IntView _r; const IntView _c; @@ -91,8 +92,8 @@ struct Functor_TestBatchedTeamVectorGMRES { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy 
policy(_D.extent(0) / _N_team, + Kokkos::AUTO(), Kokkos::AUTO()); const int N = _D.extent(0); const int n = _X.extent(1); diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp index e36ee2b67c..53b740deaa 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES_Real.hpp @@ -16,12 +16,12 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_GMRES_float) { - test_batched_teamvector_GMRES(); + test_batched_teamvector_GMRES(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_GMRES_double) { - test_batched_teamvector_GMRES(); + test_batched_teamvector_GMRES(); } #endif diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp index 67d944b159..9cbba56370 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp @@ -42,6 +42,7 @@ template struct Functor_TestBatchedTeamVectorSpmv { + using execution_space = typename DeviceType::execution_space; const alphaViewType _alpha; const ValuesViewType _D; const IntView _r; @@ -106,7 +107,7 @@ struct Functor_TestBatchedTeamVectorSpmv { const std::string name_value_type = Test::value_type_name(); std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy policy( + Kokkos::TeamPolicy policy( ceil(static_cast(_D.extent(0)) / _N_team), Kokkos::AUTO(), Kokkos::AUTO()); Kokkos::parallel_for(name.c_str(), policy, *this); diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv_Real.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv_Real.hpp index 709dea5df1..05d6dcd316 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv_Real.hpp +++ 
b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv_Real.hpp @@ -17,13 +17,13 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, batched_scalar_teamvector_spmv_nt_float_float) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_teamvector_spmv(); + test_batched_teamvector_spmv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_teamvector_spmv_nt_double_double) { typedef ::Test::Spmv::ParamTag param_tag_type; - test_batched_teamvector_spmv(); + test_batched_teamvector_spmv(); } #endif diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 04f883c21a..869b152e7b 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -101,13 +101,6 @@ KOKKOSKERNELS_GENERATE_ETI(Blas1_dot_mv dot TYPE_LISTS FLOATS LAYOUTS DEVICES ) -KOKKOSKERNELS_GENERATE_ETI(Blas_gesv gesv - COMPONENTS blas - HEADER_LIST ETI_HEADERS - SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES -) - KOKKOSKERNELS_GENERATE_ETI(Blas1_axpby axpby COMPONENTS blas HEADER_LIST ETI_HEADERS @@ -297,28 +290,28 @@ KOKKOSKERNELS_GENERATE_ETI(Blas2_ger ger TYPE_LISTS FLOATS LAYOUTS DEVICES ) -KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm +KOKKOSKERNELS_GENERATE_ETI(Blas2_syr syr COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES TYPE_LISTS FLOATS LAYOUTS DEVICES ) -KOKKOSKERNELS_GENERATE_ETI(Blas3_trsm trsm +KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES TYPE_LISTS FLOATS LAYOUTS DEVICES ) -KOKKOSKERNELS_GENERATE_ETI(Blas3_trmm trmm +KOKKOSKERNELS_GENERATE_ETI(Blas3_trsm trsm COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES TYPE_LISTS FLOATS LAYOUTS DEVICES ) -KOKKOSKERNELS_GENERATE_ETI(Blas_trtri trtri +KOKKOSKERNELS_GENERATE_ETI(Blas3_trmm trmm COMPONENTS blas HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES diff --git a/blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in 
b/blas/eti/generated_specializations_cpp/syr/KokkosBlas2_syr_eti_spec_inst.cpp.in similarity index 90% rename from blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in rename to blas/eti/generated_specializations_cpp/syr/KokkosBlas2_syr_eti_spec_inst.cpp.in index 32473be3ad..00cbe2f171 100644 --- a/blas/eti/generated_specializations_cpp/gesv/KokkosBlas_gesv_eti_spec_inst.cpp.in +++ b/blas/eti/generated_specializations_cpp/syr/KokkosBlas2_syr_eti_spec_inst.cpp.in @@ -14,13 +14,12 @@ // //@HEADER - #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true #include "KokkosKernels_config.h" -#include "KokkosBlas_gesv_spec.hpp" +#include "KokkosBlas2_syr_spec.hpp" namespace KokkosBlas { namespace Impl { -@BLAS_GESV_ETI_INST_BLOCK@ - } //IMPL +@BLAS2_SYR_ETI_INST_BLOCK@ +} //IMPL } //Kokkos diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_avail.hpp.in similarity index 83% rename from blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in rename to blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_avail.hpp.in index ae262c912e..d789bcd6ef 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_avail.hpp.in @@ -14,11 +14,12 @@ // //@HEADER -#ifndef KOKKOSBLAS_GESV_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_GESV_ETI_SPEC_AVAIL_HPP_ +#ifndef KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL_HPP_ + namespace KokkosBlas { namespace Impl { -@BLAS_GESV_ETI_AVAIL_BLOCK@ - } //IMPL +@BLAS2_SYR_ETI_AVAIL_BLOCK@ +} //IMPL } //Kokkos #endif diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index 42e2465494..08842a61c0 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -27,7 +27,7 @@ namespace KokkosBlas { namespace Impl { 
// Specialization struct which defines whether a specialization exists -template +template struct gemv_eti_spec_avail { enum : bool { value = false }; }; @@ -44,6 +44,7 @@ struct gemv_eti_spec_avail { #define KOKKOSBLAS2_GEMV_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct gemv_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -67,14 +68,14 @@ namespace Impl { // // Implementation of KokkosBlas::gemv. -template ::value, - bool eti_spec_avail = - gemv_eti_spec_avail::value> +template < + class ExecutionSpace, class AViewType, class XViewType, class YViewType, + bool tpl_spec_avail = gemv_tpl_spec_avail::value, + bool eti_spec_avail = gemv_eti_spec_avail::value> struct GEMV { - static void gemv(const typename AViewType::execution_space& space, - const char trans[], + static void gemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -130,6 +131,7 @@ struct GEMV { #define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct GEMV< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -142,6 +144,7 @@ struct GEMV { #define KOKKOSBLAS2_GEMV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct GEMV< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index fa2220e00a..651db7f11a 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -25,17 +25,17 @@ namespace KokkosBlas { namespace Impl { -// Functor for a single-level parallel_for version of nontranspose GER. -// The functor parallelizes over rows of the input matrix A. +// Functor for the thread parallel version of GER. +// This functor parallelizes over rows of the input matrix A. 
template -struct SingleLevelGER { +struct ThreadParallelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using YComponentType = typename YViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - SingleLevelGER(const bool justTranspose, const AlphaCoeffType& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + ThreadParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } @@ -69,13 +69,13 @@ struct SingleLevelGER { AViewType A_; }; -// Single-level parallel version of GER. +// Thread parallel version of GER. template -void singleLevelGer(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, - const AViewType& A) { +void threadParallelGer(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { static_assert(std::is_integral::value, "IndexType must be an integer"); @@ -90,22 +90,23 @@ void singleLevelGer(const ExecutionSpace& space, const char trans[], } else { Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); - SingleLevelGER functor( + ThreadParallelGER functor( (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::ger[SingleLevel]", rangePolicy, functor); + Kokkos::parallel_for("KokkosBlas::ger[threadParallel]", rangePolicy, + functor); } } -struct TwoLevelGER_LayoutLeftTag {}; -struct TwoLevelGER_LayoutRightTag {}; +struct TeamParallelGER_LayoutLeftTag {}; +struct TeamParallelGER_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- -// Functor for a 
two-level parallel_reduce version of GER, designed for -// performance on GPU. Kernel depends on the layout of A. +// Functor for the team parallel version of GER, designed for +// performance on GPU. The kernel depends on the layout of A. template -struct TwoLevelGER { +struct TeamParallelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using YComponentType = typename YViewType::non_const_value_type; @@ -114,15 +115,15 @@ struct TwoLevelGER { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - TwoLevelGER(const bool justTranspose, const AlphaCoeffType& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) + TeamParallelGER(const bool justTranspose, const AlphaCoeffType& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGER_LayoutLeftTag, + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutLeftTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do @@ -147,7 +148,7 @@ struct TwoLevelGER { } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGER_LayoutRightTag, + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelGER_LayoutRightTag, const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do @@ -169,7 +170,6 @@ struct TwoLevelGER { }); } } - team.team_barrier(); } private: @@ -180,12 +180,13 @@ struct TwoLevelGER { AViewType A_; }; -// Two-level parallel version of GER. +// Team parallel version of GER. 
template -void twoLevelGer(const ExecutionSpace& space, const char trans[], - const typename AViewType::const_value_type& alpha, - const XViewType& x, const YViewType& y, const AViewType& A) { +void teamParallelGer(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { static_assert(std::is_integral::value, "IndexType must be an integer"); @@ -205,8 +206,8 @@ void twoLevelGer(const ExecutionSpace& space, const char trans[], constexpr bool isLayoutLeft = std::is_same::value; using layout_tag = - typename std::conditional::type; + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { @@ -217,15 +218,17 @@ void twoLevelGer(const ExecutionSpace& space, const char trans[], teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TwoLevelGER + TeamParallelGER functor((trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); - Kokkos::parallel_for("KokkosBlas::ger[twoLevel]", teamPolicy, functor); + Kokkos::parallel_for("KokkosBlas::ger[teamParallel]", teamPolicy, functor); } // --------------------------------------------------------------------------------------------- -// generalGer: use 1 level (Range) or 2 level (Team) implementation, -// depending on whether execution space is CPU or GPU. +// generalGerImpl(): +// - use thread parallel code (rangePolicy) if execution space is CPU; +// - use team parallel code (teamPolicy) if execution space is GPU. +// // The 'enable_if' makes sure unused kernels are not instantiated. 
template +struct ThreadParallelSYR { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + ThreadParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, + const AViewType& A) + : alpha_(alpha), x_(x), A_(A) { + // Nothing to do + } + + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i) const { + // Condition 'alpha_ == zero' has already been checked. + if (x_(i) == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const XComponentType x_fixed(x_(i)); + const IndexType N(A_.extent(1)); + + if constexpr (tJustTranspose) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); + } + } + } else { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + } + } + } + } + + private: + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + AViewType A_; +}; + +// Thread parallel version of SYR. 
+template +void threadParallelSyr(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (x.extent(0) == 0) { + // no entries to update + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + } else { + Kokkos::RangePolicy rangePolicy(space, 0, + A.extent(0)); + ThreadParallelSYR + functor(alpha, x, A); + Kokkos::parallel_for("KokkosBlas::syr[thredParallel]", rangePolicy, + functor); + } +} + +struct TeamParallelSYR_LayoutLeftTag {}; +struct TeamParallelSYR_LayoutRightTag {}; + +// --------------------------------------------------------------------------------------------- + +// Functor for the team parallel version of SYR, designed for +// performance on GPUs. The kernel depends on the layout of A. +template +struct TeamParallelSYR { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TeamParallelSYR(const AlphaCoeffType& alpha, const XViewType& x, + const AViewType& A) + : alpha_(alpha), x_(x), A_(A) { + // Nothing to do + } + + public: + // LayoutLeft version: one team per column + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutLeftTag, + const member_type& team) const { + // Condition 'alpha_ == zero' has already been checked + const IndexType j(team.league_rank()); + if (x_(j) == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType M(A_.extent(0)); + if constexpr (tJustTranspose) { + const XComponentType x_fixed(x_(j)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == 
true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); + } else { + const XComponentType x_fixed( + Kokkos::ArithTraits::conj(x_(j))); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * x_fixed); + } + }); + } + } + } + + // LayoutRight version: one team per row + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR_LayoutRightTag, + const member_type& team) const { + // Condition 'alpha_ == zero' has already been checked + const IndexType i(team.league_rank()); + if (x_(i) == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType N(A_.extent(1)); + const XComponentType x_fixed(x_(i)); + if constexpr (tJustTranspose) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * x_(j)); + } + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + }); + } + } + } + + private: + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + AViewType A_; +}; + +// Team parallel version of SYR. 
+template +void teamParallelSyr(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (x.extent(0) == 0) { + // no entries to update + return; + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + return; + } + + constexpr bool isLayoutLeft = + std::is_same_v; + using layout_tag = + typename std::conditional::type; + using TeamPolicyType = Kokkos::TeamPolicy; + TeamPolicyType teamPolicy; + if (isLayoutLeft) { + // LayoutLeft: one team per column + teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); + } else { + // LayoutRight: one team per row + teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); + } + + TeamParallelSYR + functor(alpha, x, A); + Kokkos::parallel_for("KokkosBlas::syr[teamParallel]", teamPolicy, functor); +} + +// --------------------------------------------------------------------------------------------- + +// generalSyrImpl(): +// - use thread parallel code (rangePolicy) if execution space is CPU; +// - use team parallel code (teamPolicy) if execution space is GPU. +// +// The 'enable_if' makes sure unused kernels are not instantiated. 
+ +template ()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { + threadParallelSyr(space, alpha, x, A); +} + +template ()>::type* = nullptr> +void generalSyrImpl(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) { + teamParallelSyr(space, alpha, x, A); +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp new file mode 100644 index 0000000000..b07c3a1446 --- /dev/null +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -0,0 +1,170 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_SPEC_HPP_ +#define KOKKOSBLAS2_SYR_SPEC_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization availability +// KokkosBlas::Impl::SYR. This is NOT for users!!! All the declarations of full +// specializations go in this header file. 
We may spread out definitions (see +// _INST macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_SYR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// +// syr +// + +// Implementation of KokkosBlas::syr. +template ::value, + bool eti_spec_avail = + syr_eti_spec_avail::value> +struct SYR { + static void syr(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const AViewType& A) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? "KokkosBlas::syr[ETI]" + : "KokkosBlas::syr[noETI]"); + + typedef typename AViewType::size_type size_type; + const size_type numRows = A.extent(0); + const size_type numCols = A.extent(1); + + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); + bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); + + // Prefer int as the index type, but use a larsyr type if needed. 
+ if ((numRows < static_cast(INT_MAX)) && + (numCols < static_cast(INT_MAX))) { + if (justTranspose) { + if (justUp) { + generalSyrImpl( + space, alpha, x, A); + } else { + generalSyrImpl(space, alpha, x, A); + } + } else { + if (justUp) { + generalSyrImpl(space, alpha, x, A); + } else { + generalSyrImpl(space, alpha, x, A); + } + } + } else { + if (justTranspose) { + if (justUp) { + generalSyrImpl(space, alpha, x, A); + } else { + generalSyrImpl(space, alpha, x, A); + } + } else { + if (justUp) { + generalSyrImpl(space, alpha, x, A); + } else { + generalSyrImpl(space, alpha, x, A); + } + } + } + + Kokkos::Profiling::popRegion(); + } +#else + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; + +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization of KokkosBlas::Impl::SYR. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. +// We may spread out definitions (see _DEF macro below) across one or more .cpp +// files. 
+// +#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS2_SYR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include + +#endif // KOKKOSBLAS2_SYR_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index 4f3e62f343..1a0ab46bb3 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -49,8 +49,7 @@ struct impl_gemm_choose_copy_layout { #ifdef KOKKOS_ENABLE_HIP template -struct impl_gemm_choose_copy_layout { +struct impl_gemm_choose_copy_layout { using type = LayoutA; }; #endif diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index c340a41fc1..367a8dad3f 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -192,7 +192,7 @@ struct GEMM { team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_ROCM) diff --git a/blas/impl/KokkosBlas_Newton_impl.hpp b/blas/impl/KokkosBlas_Newton_impl.hpp deleted file mode 100644 index db4b8a3a43..0000000000 --- a/blas/impl/KokkosBlas_Newton_impl.hpp +++ /dev/null @@ -1,212 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef __KOKKOSBATCHED_ODE_NEWTON_HPP__ -#define __KOKKOSBATCHED_ODE_NEWTON_HPP__ - -#include "Kokkos_Core.hpp" -#include "KokkosBatched_LU_Decl.hpp" -#include "KokkosBatched_LU_Serial_Impl.hpp" -#include "KokkosBatched_Gesv.hpp" -#include "KokkosBlas1_nrm2.hpp" -#include "KokkosBlas1_scal.hpp" -#include "KokkosBlas1_axpby.hpp" - -namespace KokkosBlas { -namespace Impl { - -enum class NewtonSolverStatus { Converged = 0, LinearSolveFailure, MaxIters }; - -std::ostream& operator<<(std::ostream& os, NewtonSolverStatus& status) { - switch (status) { - case NewtonSolverStatus::Converged: os << "Newton Solver Converged!"; break; - case NewtonSolverStatus::LinearSolveFailure: - os << "Newton: Linear Solver Failure"; - break; - case NewtonSolverStatus::MaxIters: - os << "Newton reached maximum iterations without convergence."; - break; - } - return os; -} - -/// \brief NewtonHandle -/// -/// This handle is used to pass information between the Newton Solver and -/// the calling code. -/// -/// \tparam: NormViewType: Type of view used to store the residual convergence -/// history - -template -struct NewtonHandle { - using norm_type = typename NormViewType::non_const_value_type; - - NormViewType lastResidual; // Residual of last successful iteration - typename NormViewType::HostMirror lastResidualHost; - - // NormViewType residual_norms; - // TODO: Making these public for now. Should make private and access - // via setters and getters? - int maxIters; // Maximum number of Newton steps - norm_type relativeTol; // Relative convergence tolerance - bool debug_mode; // Returns extra verbose output if true. 
- - NewtonHandle(int _maxIters = 25, double _relativeTol = 1.0e-6, - bool _debug = false) - : lastResidual("ending Residual norm", 1), - lastResidualHost("end res norm host", 1), - maxIters(_maxIters), - relativeTol(_relativeTol), - debug_mode(_debug) {} - - KOKKOS_FUNCTION - void set_residual(const norm_type val) const { lastResidual(0) = val; } - - KOKKOS_FUNCTION - norm_type get_residual() const { return lastResidual(0); } - - norm_type get_residual_host() const { - Kokkos::deep_copy(lastResidualHost, lastResidual); - return lastResidualHost(0); - } - -}; // NewtonHandle - -/// \brief Newton Functor: -/// Solves the nonlinear system F(x) = 0 -/// where F is a map from R^n to R^n. -/// \tparam System: Struct that allows the evaluation -/// of the residual and jacobian using the -/// residual() and jacobian() methods. -/// \tparam Matrix: rank-2 view-type -/// \tparam XVector: rank-1 view-type -/// \tparam YVector: rank-1 view-type -/// \param -/// \param X [in]: Input vector X, a rank 1 view -/// \param Y [in/out]: Output vector Y, a rank 1 view -/// -/// No nested parallel_for is used inside of the function. 
-/// -template -struct NewtonFunctor { - using execution_space = typename YVector::execution_space; - using yvalue_type = typename YVector::non_const_value_type; - using norm_type = typename NewtonHandleType::norm_type; - - System sys; - XVector x; - YVector rhs; - NewtonHandleType handle; - - Matrix J, tmp; - XVector update; - - NewtonFunctor(System _sys, XVector _x, YVector _rhs, - NewtonHandleType& _handle) - : sys(_sys), x(_x), rhs(_rhs), handle(_handle) { - J = Matrix("Jacobian", x.extent(0), x.extent(0)); - tmp = Matrix("Jacobian", x.extent(0), x.extent(0) + 4); - update = XVector("update", x.extent(0)); - } - - KOKKOS_INLINE_FUNCTION - NewtonSolverStatus solve() const { - norm_type norm = Kokkos::ArithTraits::zero(); - yvalue_type alpha = Kokkos::ArithTraits::one(); - handle.set_residual(-1); // init to dummy value - - // Iterate until maxIts or the tolerance is reached - for (int it = 0; it < handle.maxIters; ++it) { - // compute initial rhs - sys.residual(x, rhs); - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("NewtonFunctor: r="); - for (int k = 0; k < rhs.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", rhs(k)); - } - } - - // Solve the following linearized - // problem at each step: J*update=-rhs - // with J=du/dx, rhs=f(u_n+update)-f(u_n) - norm = KokkosBlas::serial_nrm2(rhs); - handle.set_residual(norm); - - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Iteration: %d Current res norm is: %e \n Current " - "soln is:\n", - it, (double)handle.get_residual()); - for (int k = 0; k < x.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); - } - } - - if (norm < handle.relativeTol) { - // Problem solved, exit the functor - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Newton solver converged! 
Ending norm is: %e \n " - "Solution x is: " - "\n", - norm); - for (int k = 0; k < x.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); - } - } - return NewtonSolverStatus::Converged; - } - - // compute LHS - sys.jacobian(x, J); - - // solve linear problem - int linSolverStat = KokkosBatched::SerialGesv< - KokkosBatched::Gesv::StaticPivoting>::invoke(J, update, rhs, tmp); - KokkosBlas::SerialScale::invoke(-1, update); - - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Print linear solve solution: \n"); - for (int k = 0; k < update.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", update(k)); - } - } - if (linSolverStat == 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Linear solve gesv returned failure! \n"); - return NewtonSolverStatus::LinearSolveFailure; - } - - // update solution // x = x + alpha*update - KokkosBlas::serial_axpy(alpha, update, x); - if (handle.debug_mode) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "NewtonFunctor: Print updated solution: \n"); - for (int k = 0; k < x.extent_int(0); k++) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); - } - } - } - return NewtonSolverStatus::MaxIters; - } // End solve functor. -}; - -} // namespace Impl -} // namespace KokkosBlas -#endif // __KOKKOSBATCHED_ODE_NEWTON_HPP__ diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 47fa1f536f..32ede3090c 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -40,8 +40,6 @@ namespace KokkosBlas { /// \param alpha [in] The scalar to apply to A. /// \param A [in] The vector to apply to X. /// \param X [in] The X vector. -/// -/// \return Y = gamma * Y + alpha * A * X. 
template void mult(const execution_space& space, typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, @@ -139,8 +137,6 @@ void mult(const execution_space& space, typename YMV::const_value_type& gamma, /// \param alpha [in] The scalar to apply to A. /// \param A [in] The vector to apply to X. /// \param X [in] The X vector. -/// -/// \return Y = gamma * Y + alpha * A * X. template void mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) { diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index 67cdde17fa..64643367a0 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -241,10 +241,17 @@ KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { " Kokkos::ArithTraits::mag_type"); if (R.extent(0) != X.extent(1)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match," " R: %d and X: %d x %d.\n", R.extent_int(0), X.extent_int(0), X.extent_int(1)); +#else + Kokkos::printf( + "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match," + " R: %d and X: %d x %d.\n", + R.extent_int(0), X.extent_int(0), X.extent_int(1)); +#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/blas/src/KokkosBlas1_swap.hpp b/blas/src/KokkosBlas1_swap.hpp index f91d090cd5..26c529f3b7 100644 --- a/blas/src/KokkosBlas1_swap.hpp +++ b/blas/src/KokkosBlas1_swap.hpp @@ -33,10 +33,9 @@ namespace KokkosBlas { /// \param x [in/out] 1-D View. /// \param y [in/out] 1-D View. /// -/// \return x and y with swapped values, note that this is akin to -/// performing a deep_copy, swapping pointers inside view -/// can only be performed if no aliasing, subviews, etc... -/// exist, which cannot be asserted by this function. +/// Swaps x and y. 
Note that this is akin to performing a deep_copy, swapping +/// pointers inside view can only be performed if no aliasing, subviews, etc... +/// exist, which cannot be asserted by this function. /// /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking @@ -107,8 +106,6 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { /// \param x [in/out] 1-D View. /// \param y [in/out] 1-D View. /// -/// \return x and y with swapped values. -/// /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking. Note that the kernel will be /// executed on the default stream of the execution_space associted with x. diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index a8ebf02ca3..614b48d47a 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -49,14 +49,14 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View -template -void gemv(const execution_space& space, const char trans[], +void gemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::gemv: execution_space must be a valid Kokkos " + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::gemv: ExecutionSpace must be a valid Kokkos " "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: AViewType must be a Kokkos::View."); @@ -71,25 +71,17 @@ void gemv(const execution_space& space, const char trans[], static_assert(static_cast(YViewType::rank) == 1, "KokkosBlas::gemv: YViewType must have rank 1."); static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: 
AViewType must be accessible from execution_space"); + "KokkosBlas::gemv: AViewType must be accessible from ExecutionSpace"); static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: XViewType must be accessible from execution_space"); + "KokkosBlas::gemv: XViewType must be accessible from ExecutionSpace"); static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: YViewType must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemv: AViewType must be assignable to YViewType"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemv: XViewType must be assignable to YViewType"); + "KokkosBlas::gemv: YViewType must be accessible from ExecutionSpace"); // Check compatibility of dimensions at run time. if (trans[0] == 'N' || trans[0] == 'n') { @@ -155,11 +147,11 @@ void gemv(const execution_space& space, const char trans[], #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS useFallback = - useFallback || (tolower(*trans) == 'c' && - std::is_same::value && - std::is_same::value); + useFallback || + (tolower(*trans) == 'c' && + std::is_same::value && + std::is_same::value); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS useFallback = useFallback || (tolower(*trans) == 'c' && @@ -168,13 +160,24 @@ void gemv(const execution_space& space, const char trans[], std::is_same::value); #endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#ifdef KOKKOS_ENABLE_SYCL + // oneMKL supports both row-major and column-major of A + useFallback = + useFallback || !std::is_same_v; +#endif +#endif + if (useFallback) { const bool eti_spec_avail = - KokkosBlas::Impl::gemv_eti_spec_avail::value; - typedef Impl::GEMV fallback_impl_type; + KokkosBlas::Impl::gemv_eti_spec_avail::value; + typedef Impl::GEMV + fallback_impl_type; fallback_impl_type::gemv(space, trans, alpha, A, x, beta, y); } else { - typedef Impl::GEMV impl_type; + typedef Impl::GEMV impl_type; impl_type::gemv(space, trans, 
alpha, A, x, beta, y); } } diff --git a/blas/src/KokkosBlas2_syr.hpp b/blas/src/KokkosBlas2_syr.hpp new file mode 100644 index 0000000000..af66767ab4 --- /dev/null +++ b/blas/src/KokkosBlas2_syr.hpp @@ -0,0 +1,189 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_HPP_ +#define KOKKOSBLAS2_SYR_HPP_ + +#include + +namespace KokkosBlas { + +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// symmetric/Hermitian matrix: A = A + alpha * x * x^{T,H}. +/// +/// Important note 1: this routine encapsulates the syr() and her() +/// routines specified in BLAS documentations. It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). Therefore, in +/// Hermitian cases, the parameter alpha must be real. +/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian, and +/// even if a complex alpha is supplied in Hermitian cases. Moreover, +/// this routine will always compute either the lower portion or the +/// upper portion (per user's request) of the final matrix A. So, in +/// order to obtain meaningful results, the user must make sure to +/// follow the conditions specified in the "important note 1" above. 
+/// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honoring all +/// parameters passed, as stated in the "important note 2" above. +/// +/// \tparam ExecutionSpace The type of execution space +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param space [in] Execution space instance on which to run the kernel. +/// This may contain information about which stream to +/// run on. +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. 
+/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + static_assert( + Kokkos::SpaceAccessibility::assignable, + "AViewType memory space must be assignable from XViewType"); + + static_assert( + Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "XViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank) == 2, + "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, + "XViewType must have rank 1."); + + // Check compatibility of dimensions at run time. + if ((A.extent(0) != x.extent(0)) || (A.extent(1) != x.extent(0))) { + std::ostringstream os; + os << "KokkosBlas::syr: Dimensions of A, x: " + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " + << x.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas2::syr(): invalid trans[0] = '" << trans[0] + << "'. It must be equal to 'T' or 't' or 'H' or 'h'"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || + (uplo[0] == 'l')) { + // Ok + } else { + std::ostringstream oss; + oss << "KokkosBlas2::syr(): invalid uplo[0] = " << uplo[0] + << "'. 
It must be equal to 'U' or 'u' or 'L' or 'l'"; + throw std::runtime_error(oss.str()); + } + + if ((A.extent(0) == 0) || (A.extent(1) == 0)) { + return; + } + + using ALayout = typename AViewType::array_layout; + + // Minimize the number of Impl::SYR instantiations, by standardizing + // on particular View specializations for its template parameters. + using XVT = + Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits >; + + using AVT = Kokkos::View >; + + Impl::SYR::syr(space, trans, uplo, alpha, x, A); +} + +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// symmetric/Hermitian matrix: A = A + alpha * x * x^{T,H}. +/// +/// Important note 1: this routine encapsulates the syr() and her() +/// routines specified in BLAS documentations. It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). Therefore, in +/// Hermitian cases, the parameter alpha must be real. +/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian, and +/// even if a complex alpha is supplied in Hermitian cases. Moreover, +/// this routine will always compute either the lower portion or the +/// upper portion (per user's request) of the final matrix A. So, in +/// order to obtain meaningful results, the user must make sure to +/// follow the conditions specified in the "important note 1" above. +/// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honoring all +/// parameters passed, as stated in the "important note 2" above. 
+/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. +/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr(const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const AViewType& A) { + const typename AViewType::execution_space space = + typename AViewType::execution_space(); + syr( + space, trans, uplo, alpha, x, A); +} + +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR_HPP_ diff --git a/blas/src/KokkosBlas3_gemm.hpp b/blas/src/KokkosBlas3_gemm.hpp index 0cb00c8493..febd39b149 100644 --- a/blas/src/KokkosBlas3_gemm.hpp +++ b/blas/src/KokkosBlas3_gemm.hpp @@ -142,14 +142,6 @@ void gemm(const execution_space& space, const char transA[], Kokkos::SpaceAccessibility::accessible, "KokkosBlas::gemm: CViewType must be accessible from execution_space"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemm: CViewType must be assignable by AViewType"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemm: CViewType must be assignable by BViewType"); // Check validity of transpose argument bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || diff --git a/blas/src/KokkosBlas_gesv.hpp b/blas/src/KokkosBlas_gesv.hpp index 89b9d36c96..1326c6fb8e 100644 --- a/blas/src/KokkosBlas_gesv.hpp +++ b/blas/src/KokkosBlas_gesv.hpp @@ -25,10 +25,7 @@ #ifndef KOKKOSBLAS_GESV_HPP_ #define KOKKOSBLAS_GESV_HPP_ -#include - -#include "KokkosBlas_gesv_spec.hpp" -#include 
"KokkosKernels_Error.hpp" +#include "KokkosLapack_gesv.hpp" namespace KokkosBlas { @@ -49,100 +46,8 @@ namespace KokkosBlas { /// its data pointer is NULL, pivoting is not used. /// template -void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { - // NOTE: Currently, KokkosBlas::gesv only supports for MAGMA TPL and BLAS TPL. - // MAGMA TPL should be enabled to call the MAGMA GPU interface for - // device views BLAS TPL should be enabled to call the BLAS interface - // for host views - - static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: A must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: B must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::gesv: IPIV must be a Kokkos::View."); - static_assert(static_cast(AMatrix::rank) == 2, - "KokkosBlas::gesv: A must have rank 2."); - static_assert( - static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, - "KokkosBlas::gesv: B must have either rank 1 or rank 2."); - static_assert(static_cast(IPIVV::rank) == 1, - "KokkosBlas::gesv: IPIV must have rank 1."); - - int64_t IPIV0 = IPIV.extent(0); - int64_t A0 = A.extent(0); - int64_t A1 = A.extent(1); - int64_t B0 = B.extent(0); - - // Check validity of pivot argument - bool valid_pivot = - (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); - if (!(valid_pivot)) { - std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". " - << "Valid options include zero-extent 1-D view (no pivoting), or 1-D " - "View with size of " - << A0 << " (partial pivoting)."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - // Check for no pivoting case. Only MAGMA supports no pivoting interface -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL - if ((!std::is_same::value) && - (IPIV0 == 0) && (IPIV.data() == nullptr)) { - std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". 
" - << "BLAS TPL does not support no pivoting."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } -#endif -#else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL - if ((IPIV0 == 0) && (IPIV.data() == nullptr)) { - std::ostringstream os; - os << "KokkosBlas::gesv: IPIV: " << IPIV0 << ". " - << "BLAS TPL does not support no pivoting."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } -#endif -#endif - - // Check compatibility of dimensions at run time. - if ((A0 < A1) || (A0 != B0)) { - std::ostringstream os; - os << "KokkosBlas::gesv: Dimensions of A, and B do not match: " - << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) - << " x " << B.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - typedef Kokkos::View< - typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, - typename AMatrix::device_type, Kokkos::MemoryTraits > - AMatrix_Internal; - typedef Kokkos::View > - BXMV_Internal; - typedef Kokkos::View< - typename IPIVV::non_const_value_type*, typename IPIVV::array_layout, - typename IPIVV::device_type, Kokkos::MemoryTraits > - IPIVV_Internal; - AMatrix_Internal A_i = A; - // BXMV_Internal B_i = B; - IPIVV_Internal IPIV_i = IPIV; - - if (BXMV::rank == 1) { - auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); - KokkosBlas::Impl::GESV::gesv(A_i, B_i, IPIV_i); - } else { // BXMV::rank == 2 - auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); - KokkosBlas::Impl::GESV::gesv(A_i, B_i, IPIV_i); - } +[[deprecated]] void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { + KokkosLapack::gesv(A, B, IPIV); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas_trtri.hpp b/blas/src/KokkosBlas_trtri.hpp index b1a34f0483..d9771e3a16 100644 --- a/blas/src/KokkosBlas_trtri.hpp +++ b/blas/src/KokkosBlas_trtri.hpp @@ -18,12 +18,7 @@ /// \file KokkosBlas_trtri.hpp -#include "KokkosKernels_Macros.hpp" -#include 
"KokkosBlas_trtri_spec.hpp" -#include "KokkosKernels_helpers.hpp" -#include -#include -#include "KokkosKernels_Error.hpp" +#include "KokkosLapack_trtri.hpp" namespace KokkosBlas { @@ -48,70 +43,9 @@ namespace KokkosBlas { // and the inversion could not be completed. // source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri template -int trtri(const char uplo[], const char diag[], const AViewType& A) { - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); - - // Check validity of indicator argument - bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || - (uplo[0] == 'l'); - bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || - (diag[0] == 'n'); - - if (!valid_uplo) { - std::ostringstream os; - os << "KokkosBlas::trtri: uplo = '" << uplo[0] << "'. " - << "Valid values include 'U' or 'u' (A is upper triangular), " - "'L' or 'l' (A is lower triangular)."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - if (!valid_diag) { - std::ostringstream os; - os << "KokkosBlas::trtri: diag = '" << diag[0] << "'. " - << "Valid values include 'U' or 'u' (the diagonal of A is assumed to be " - "unit), " - "'N' or 'n' (the diagonal of A is assumed to be non-unit)."; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - int64_t A_m = A.extent(0); - int64_t A_n = A.extent(1); - - // Return if degenerated matrices are provided - if (A_m == 0 || A_n == 0) - return 0; // This is success as the inverse of a matrix with no elements is - // itself. 
- - // Ensure that the dimensions of A match and that we can legally perform A*B - // or B*A - if (A_m != A_n) { - std::ostringstream os; - os << "KokkosBlas::trtri: Dimensions of A do not match," - << " A: " << A.extent(0) << " x " << A.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - // Create A matrix view type alias - using AViewInternalType = - Kokkos::View >; - - // This is the return value type and should always reside on host - using RViewInternalType = - Kokkos::View >; - - int result; - RViewInternalType R = RViewInternalType(&result); - - KokkosBlas::Impl::TRTRI::trtri(R, uplo, - diag, A); - - return result; +[[deprecated]] int trtri(const char uplo[], const char diag[], + const AViewType& A) { + return KokkosLapack::trtri(uplo, diag, A); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index a58c90d8e9..de930f6107 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -57,33 +57,42 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ template <> \ struct nrm2_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View< \ - typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ - LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ + EXECSPACE, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ }; -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) 
-KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + KOKKOSBLAS1_NRM2_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ + MEMSPACE) + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +#endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) +KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 5e017cb7e1..736523aa8d 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -214,214 +214,231 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, #endif -// cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS #include namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - 
static void nrm2(const execution_space& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDnrm2(s.handle, N, X.data(), int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2( \ - space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + TPL_NRM2(s.handle, N, 
reinterpret_cast(X.data()), \ + 1, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const execution_space& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSnrm2(s.handle, N, X.data(), int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2( \ - space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasSnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuComplex, 
Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasScnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ + Kokkos::Cuda, Kokkos::CudaSpace, cublasDznrm2, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const execution_space& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDznrm2( \ - s.handle, N, reinterpret_cast(X.data()), \ - int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2( \ - space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(true) +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(false) -#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef 
Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm2(const execution_space& space, RV& R, const XV& X, \ - const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasScnrm2( \ - s.handle, N, reinterpret_cast(X.data()), \ - int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2( \ - space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +} // namespace Impl +} // namespace KokkosBlas + +#endif -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include + +namespace KokkosBlas { +namespace Impl { -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + 
Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ROCBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), \ + 1, &R())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_snrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_dnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_scnrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_dznrm2, ETI_SPEC_AVAIL) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, 
true) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(true) +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) } // namespace Impl } // namespace KokkosBlas #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const std::int64_t N = static_cast(numElems); \ + TPL_NRM2(space.sycl_queue(), N, \ + reinterpret_cast(X.data()), 1, &R()); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + 
Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::nrm2, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(true) +KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ONEMKL_EXT(false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL && KOKKOS_ENABLE_SYCL + #endif diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index f203be944f..0820badd9a 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gemv_tpl_spec_avail { enum : bool { value = false }; }; @@ -32,6 +32,7 @@ struct gemv_tpl_spec_avail { LAYOUTY, MEMSPACE) \ template \ struct gemv_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -78,6 +79,7 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, LAYOUTY, MEMSPACE) \ template \ struct gemv_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -125,22 +127,20 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template <> \ - struct gemv_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { 
value = true }; \ +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template \ + struct gemv_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) @@ -158,6 +158,49 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) #endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) + +#define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ + template \ + struct gemv_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(double, Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(float, Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, + Kokkos::LayoutLeft) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, + Kokkos::LayoutLeft) + +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(double, Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(float, Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, + Kokkos::LayoutRight) +KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(Kokkos::complex, + Kokkos::LayoutRight) + +#endif + +#endif + } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 95e589bbf0..2ace065808 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -43,51 +43,52 @@ namespace Impl { transa = 'C'; \ } -#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - 
template \ - struct GEMV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const typename AViewType::execution_space& /* space */, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \ - one, beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \ + one, beta, Y.data(), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define 
KOKKOSBLAS2_SGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -111,8 +112,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& /* space */, \ - const char trans[], \ + static void gemv(const ExecSpace& /* space */, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -128,7 +128,8 @@ namespace Impl { #define KOKKOSBLAS2_ZGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMV**, LAYOUTA, \ + struct GEMV**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUTX, \ @@ -152,8 +153,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& /* space */, \ - const char trans[], \ + static void gemv(const ExecSpace& /* space */, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -171,50 +171,50 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const typename AViewType::execution_space& /* space */, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - 
const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_BLAS,complex]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - const std::complex alpha_val = alpha, beta_val = beta; \ - HostBlas >::gemv( \ - transa, M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(X.data()), one, \ - beta_val, reinterpret_cast*>(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& /* space */, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv( \ + transa, M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(X.data()), one, \ + beta_val, reinterpret_cast*>(Y.data()), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, @@ -288,6 +288,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -311,8 +312,7 @@ 
namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -335,6 +335,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct GEMV< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -358,8 +359,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -381,7 +381,8 @@ namespace Impl { #define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMV**, LAYOUTA, \ + struct GEMV**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUTX, \ @@ -405,8 +406,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -433,7 +433,8 @@ namespace Impl { #define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMV**, LAYOUTA, \ + struct GEMV**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUTX, \ @@ -457,8 +458,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& space, const char trans[], 
\ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -548,242 +548,327 @@ namespace Impl { transa = rocblas_operation_conjugate_transpose; \ } -#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GEMV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ 
+ XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ + X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GEMV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ - X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ + X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GEMV< \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - 
Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + 
"KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ + s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ +#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ + s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + 
reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +} // namespace Impl +} // namespace KokkosBlas +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +// ONEMKL +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ + defined(KOKKOS_ENABLE_SYCL) +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { + +inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { + switch (toupper(mode_kk)) { + case 'N': return oneapi::mkl::transpose::nontrans; + case 'T': return oneapi::mkl::transpose::trans; + case 'C': return oneapi::mkl::transpose::conjtrans; + 
default:; + } + throw std::invalid_argument( + "Invalid mode for oneMKL (should be one of N, T, C)"); +} + +template +struct kokkos_to_std_type_map { + using type = T; +}; + +// e.g., map Kokkos::complex to std::complex +template +struct kokkos_to_std_type_map { + using type = std::complex::mag_type>; +}; + +#define KOKKOSBLAS2_GEMV_ONEMKL(SCALAR, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ struct GEMV< \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + ExecSpace, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ + using device_type = Kokkos::Device; \ + using mem_traits = Kokkos::MemoryTraits; \ + using AViewType = \ + Kokkos::View; \ + using XViewType = \ + Kokkos::View; \ + using YViewType = Kokkos::View; \ \ - static void gemv(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void gemv(const ExecSpace& exec, const char kk_trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ - s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - 
reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + bool row_major = std::is_same::value; \ + const std::int64_t M = A.extent(0); \ + const std::int64_t N = A.extent(1); \ + oneapi::mkl::transpose trans = mode_kk_to_onemkl(kk_trans[0]); \ + const std::int64_t LDA = row_major ? A.stride(0) : A.stride(1); \ + std::string label = "KokkosBlas::gemv[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + \ + Kokkos::Profiling::pushRegion(label); \ + using mag_type = kokkos_to_std_type_map< \ + SCALAR, Kokkos::ArithTraits::is_complex>::type; \ + const mag_type* a = reinterpret_cast(A.data()); \ + const mag_type* x = reinterpret_cast(X.data()); \ + mag_type* y = reinterpret_cast(Y.data()); \ + if (row_major) { \ + oneapi::mkl::blas::row_major::gemv(exec.sycl_queue(), trans, M, N, \ + alpha, a, LDA, x, 1, beta, y, 1); \ + } else { \ + oneapi::mkl::blas::column_major::gemv( \ + exec.sycl_queue(), trans, M, N, alpha, a, LDA, x, 1, beta, y, 1); \ + } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_SGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) 
-KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_ZGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, - false) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - true) -KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, - false) - +KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(float, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(double, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) +KOKKOSBLAS2_GEMV_ONEMKL(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Experimental::SYCLDeviceUSMSpace, true) } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#endif #endif diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index b672c690d5..3013689f34 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct ger_tpl_spec_avail { enum : bool { value = false }; }; diff --git 
a/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp new file mode 100644 index 0000000000..f537b3854a --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_avail.hpp @@ -0,0 +1,192 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side BLAS (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) 
+KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Serial, Kokkos::HostSpace) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::OpenMP, Kokkos::HostSpace) +#endif + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + 
Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +#define KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + 
Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) + +#endif +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl.hpp new file mode 100644 index 0000000000..1480bb1655 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl.hpp @@ -0,0 +1,35 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_DECL_HPP_ + +// BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +#include +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include +#endif + +#endif diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp new file mode 100644 index 0000000000..6b64fce2bc --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_blas.hpp @@ -0,0 +1,285 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_DECL_BLAS_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_DECL_BLAS_HPP_ + +#include "KokkosBlas_Host_tpl.hpp" + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); + +#define KOKKOSBLAS2_DSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), \ + LDA); \ + } else { \ + /* blasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr(uplo[0], N, alpha, X.data(), one, A.data(), \ + LDA); \ + } else { \ + /* blasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + 
} \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr() => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + HostBlas>::zher( \ + uplo[0], N, alpha.real(), \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename
AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_SYR_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr() => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + HostBlas>::cher( \ + uplo[0], N, alpha.real(), \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher() + [~A_ll or ~real alpha] => call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true)
+KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) +#endif + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp new file mode 100644 index 0000000000..dad3c93dbc --- /dev/null +++ 
b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -0,0 +1,332 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_DECL_CUBLAS_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_DECL_CUBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? 
CUBLAS_FILL_MODE_LOWER \ + : CUBLAS_FILL_MODE_UPPER; + +#define KOKKOSBLAS2_DSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDsyr( \ + s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const 
AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSsyr( \ + s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher( \ + s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + 
cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCsyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) + +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) 
+KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) + +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) + +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) 
+KOKKOSBLAS2_CSYR_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp new file mode 100644 index 0000000000..cf02e9e207 --- /dev/null +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -0,0 +1,312 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_TPL_SPEC_DECL_ROCBLAS_HPP_ +#define KOKKOSBLAS2_SYR_TPL_SPEC_DECL_ROCBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? 
rocblas_fill_lower \ + : rocblas_fill_upper; + +#define KOKKOSBLAS2_DSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr( \ + s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const 
char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr( \ + s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + 
rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher( \ + s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename 
AViewType::const_value_type& alpha, \ + const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher( \ + s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr( \ + space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, 
Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 69146baf4f..8e96898b10 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -164,26 +164,22 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, }; KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + 
Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutRight, Kokkos::HIPSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutRight, Kokkos::HIPSpace) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 37733f609e..6b158f4d19 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -236,19 +236,42 @@ void F77_BLAS_MANGLE(cgeru, CGERU)(int*, int*, const std::complex*, const std::complex*, int*, const std::complex*, int*, std::complex*, int*); -void F77_BLAS_MANGLE(cgerc, CGERC)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); void F77_BLAS_MANGLE(zgeru, ZGERU)(int*, int*, const std::complex*, const std::complex*, int*, const std::complex*, int*, std::complex*, int*); +void F77_BLAS_MANGLE(cgerc, CGERC)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); void F77_BLAS_MANGLE(zgerc, ZGERC)(int*, int*, const std::complex*, const std::complex*, int*, const std::complex*, int*, std::complex*, int*); +/// +/// Syr +/// +void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, int*, const float*, const float*, + int*, float*, int*); +void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, int*, const double*, + const double*, int*, double*, int*); +// Although there is a cgeru, there is 
no csyru +// Although there is a zgeru, there is no zsyru +// Although there is a cgerc, there is no csyrc, but there is cher (see below) +// Although there is a zgerc, there is no zsyrc, but there is zher (see below) + +/// +/// Her +/// + +void F77_BLAS_MANGLE(cher, CHER)(const char*, int*, const float*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(zher, ZHER)(const char*, int*, const double*, + const std::complex*, int*, + std::complex*, int*); + /// /// Trsv /// @@ -359,39 +382,6 @@ void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, const std::complex*, const std::complex*, int*, /* */ std::complex*, int*); - -/// -/// Gesv -/// - -void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, - int*); -void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, - int*, int*); -void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, - std::complex*, int*, int*); -void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, - int*, std::complex*, int*, int*); - -/// -/// Trtri -/// -/* - HostBlas::trtri(const char uplo, const char diag, - int n, const float *a, int lda) { - int info = 0; - F77_FUNC_STRTRI(&uplo, - &diag, &n, - a, &lda, &info); -*/ -void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, - const float*, int*, int*); -void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, - const double*, int*, int*); -void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); -void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, - const std::complex*, int*, int*); } void F77_BLAS_MANGLE(sscal, SSCAL)(const int* N, const float* alpha, @@ -466,10 +456,16 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_SGER F77_BLAS_MANGLE(sger, SGER) #define F77_FUNC_DGER F77_BLAS_MANGLE(dger, DGER) #define F77_FUNC_CGERU F77_BLAS_MANGLE(cgeru, CGERU) -#define F77_FUNC_CGERC 
F77_BLAS_MANGLE(cgerc, CGERC) #define F77_FUNC_ZGERU F77_BLAS_MANGLE(zgeru, ZGERU) +#define F77_FUNC_CGERC F77_BLAS_MANGLE(cgerc, CGERC) #define F77_FUNC_ZGERC F77_BLAS_MANGLE(zgerc, ZGERC) +#define F77_FUNC_SSYR F77_BLAS_MANGLE(ssyr, SSYR) +#define F77_FUNC_DSYR F77_BLAS_MANGLE(dsyr, DSYR) + +#define F77_FUNC_CHER F77_BLAS_MANGLE(cher, CHER) +#define F77_FUNC_ZHER F77_BLAS_MANGLE(zher, ZHER) + #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) #define F77_FUNC_CTRSV F77_BLAS_MANGLE(ctrsv, CTRSV) @@ -495,16 +491,6 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CTRSM F77_BLAS_MANGLE(ctrsm, CTRSM) #define F77_FUNC_ZTRSM F77_BLAS_MANGLE(ztrsm, ZTRSM) -#define F77_FUNC_SGESV F77_BLAS_MANGLE(sgesv, SGESV) -#define F77_FUNC_DGESV F77_BLAS_MANGLE(dgesv, DGESV) -#define F77_FUNC_CGESV F77_BLAS_MANGLE(cgesv, CGESV) -#define F77_FUNC_ZGESV F77_BLAS_MANGLE(zgesv, ZGESV) - -#define F77_FUNC_STRTRI F77_BLAS_MANGLE(strtri, STRTRI) -#define F77_FUNC_DTRTRI F77_BLAS_MANGLE(dtrtri, DTRTRI) -#define F77_FUNC_CTRTRI F77_BLAS_MANGLE(ctrtri, CTRTRI) -#define F77_FUNC_ZTRTRI F77_BLAS_MANGLE(ztrtri, ZTRTRI) - namespace KokkosBlas { namespace Impl { @@ -577,6 +563,11 @@ void HostBlas::ger(int m, int n, const float alpha, const float* x, F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> +void HostBlas::syr(const char uplo, int n, const float alpha, + const float* x, int incx, float* a, int lda) { + F77_FUNC_SSYR(&uplo, &n, &alpha, x, &incx, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const float* a, int lda, /* */ float* b, int ldb) { @@ -613,18 +604,6 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } -template <> -void HostBlas::gesv(int n, int rhs, float* a, int lda, int* ipiv, - float* b, int ldb, int info) { - F77_FUNC_SGESV(&n, &rhs, a, 
&lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas::trtri(const char uplo, const char diag, int n, - const float* a, int lda) { - int info = 0; - F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} /// /// double @@ -696,6 +675,11 @@ void HostBlas::ger(int m, int n, const double alpha, const double* x, F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> +void HostBlas::syr(const char uplo, int n, const double alpha, + const double* x, int incx, double* a, int lda) { + F77_FUNC_DSYR(&uplo, &n, &alpha, x, &incx, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const double* a, int lda, /* */ double* b, int ldb) { @@ -732,18 +716,6 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } -template <> -void HostBlas::gesv(int n, int rhs, double* a, int lda, int* ipiv, - double* b, int ldb, int info) { - F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas::trtri(const char uplo, const char diag, int n, - const double* a, int lda) { - int info = 0; - F77_FUNC_DTRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} /// /// std::complex @@ -842,6 +814,14 @@ void HostBlas >::gerc( &lda); } template <> +template <> +void HostBlas >::cher( + const char uplo, int n, const float alpha, const std::complex* x, + int incx, std::complex* a, int lda) { + F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (std::complex*)a, &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, int lda, @@ -897,21 +877,6 @@ void HostBlas >::trsm(const char side, const char uplo, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } -template <> -void HostBlas >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { - 
F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas >::trtri(const char uplo, const char diag, - int n, const std::complex* a, - int lda) { - int info = 0; - F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} /// /// std::complex @@ -1012,6 +977,14 @@ void HostBlas >::gerc( (std::complex*)a, &lda); } template <> +template <> +void HostBlas >::zher( + const char uplo, int n, const double alpha, const std::complex* x, + int incx, std::complex* a, int lda) { + F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (std::complex*)a, &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, @@ -1063,21 +1036,6 @@ void HostBlas >::trsm( (const std::complex*)a, &lda, (std::complex*)b, &ldb); } -template <> -void HostBlas >::gesv(int n, int rhs, - std::complex* a, int lda, - int* ipiv, std::complex* b, - int ldb, int info) { - F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); -} -template <> -int HostBlas >::trtri(const char uplo, const char diag, - int n, const std::complex* a, - int lda) { - int info = 0; - F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); - return info; -} } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index cd53537ea6..06a5620155 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -73,6 +73,17 @@ struct HostBlas { static void gerc(int m, int n, const T alpha, const T *x, int incx, const T *y, int incy, T *a, int lda); + static void syr(const char uplo, int n, const T alpha, const T *x, int incx, + T *a, int lda); + + template + static void cher(const char uplo, int n, const tAlpha alpha, const T *x, + int incx, T *a, int lda); + + template + static void zher(const char uplo, int n, const tAlpha alpha, const T *x, + int incx, T *a, int lda); + static void trsv(const char uplo, const char transa, 
const char diag, int m, const T *a, int lda, /* */ T *b, int ldb); @@ -95,12 +106,6 @@ struct HostBlas { const char diag, int m, int n, const T alpha, const T *a, int lda, /* */ T *b, int ldb); - - static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, - int info); - - static int trtri(const char uplo, const char diag, int n, const T *a, - int lda); }; } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp deleted file mode 100644 index de9fc08c99..0000000000 --- a/blas/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ /dev/null @@ -1,107 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ - -namespace KokkosBlas { -namespace Impl { - -// Specialization struct which defines whether a specialization exists -template -struct trtri_tpl_spec_avail { - enum : bool { value = false }; -}; - -// Generic Host side LAPACK (could be MKL or whatever) -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ - template \ - struct trtri_tpl_spec_avail< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ - }; - -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, MEMSPACE) \ - KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) -#else -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTA, MEMSPACE) -#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS - -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) \ - KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) -#else -#define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA - -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) 
-KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) - -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) - -} // namespace Impl -} // namespace KokkosBlas - -#endif // KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index ff955d13a8..a29c5ffd72 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -16,9 +16,6 @@ #ifndef TEST_BLAS_HPP #define TEST_BLAS_HPP -#include "Test_Blas_gesv.hpp" -#include "Test_Blas_trtri.hpp" - // Blas 1 #include "Test_Blas1_abs.hpp" #include "Test_Blas1_asum.hpp" 
@@ -62,6 +59,7 @@ // Blas 2 #include "Test_Blas2_gemv.hpp" #include "Test_Blas2_ger.hpp" +#include "Test_Blas2_syr.hpp" // Serial Blas 2 #include "Test_Blas2_serial_gemv.hpp" @@ -75,9 +73,6 @@ #include "Test_Blas3_trmm.hpp" #include "Test_Blas3_trsm.hpp" -// Stuff that should move later on -#include "Test_Blas_Newton.hpp" - // TPLs #include "Test_Blas_rocblas.hpp" diff --git a/blas/unit_test/Test_Blas1_abs.hpp b/blas/unit_test/Test_Blas1_abs.hpp index 8a2c7e3374..5bf3f55388 100644 --- a/blas/unit_test/Test_Blas1_abs.hpp +++ b/blas/unit_test/Test_Blas1_abs.hpp @@ -213,12 +213,12 @@ int test_abs_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_float"); - test_abs(); + test_abs(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, abs_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_float"); - test_abs_mv(); + test_abs_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -228,12 +228,12 @@ TEST_F(TestCategory, abs_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); - test_abs(); + test_abs(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, abs_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_double"); - test_abs_mv(); + test_abs_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -243,13 +243,12 @@ TEST_F(TestCategory, abs_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_double"); - test_abs, Kokkos::complex, TestExecSpace>(); + test_abs, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, abs_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_double"); - test_abs_mv, Kokkos::complex, - TestExecSpace>(); + test_abs_mv, Kokkos::complex, TestDevice>(); 
Kokkos::Profiling::popRegion(); } #endif @@ -259,21 +258,21 @@ TEST_F(TestCategory, abs_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, abs_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_int"); - test_abs(); + test_abs(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, abs_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::abs_mv_int"); - test_abs_mv(); + test_abs_mv(); Kokkos::Profiling::popRegion(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -abs_double_int ) { test_abs (); +abs_double_int ) { test_abs (); } TEST_F( TestCategory, abs_double_mv_int ) { - test_abs_mv (); + test_abs_mv (); } #endif*/ diff --git a/blas/unit_test/Test_Blas1_asum.hpp b/blas/unit_test/Test_Blas1_asum.hpp index e914c9a19a..65b5b2c063 100644 --- a/blas/unit_test/Test_Blas1_asum.hpp +++ b/blas/unit_test/Test_Blas1_asum.hpp @@ -98,7 +98,7 @@ int test_asum() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_float"); - test_asum(); + test_asum(); Kokkos::Profiling::popRegion(); } #endif @@ -108,7 +108,7 @@ TEST_F(TestCategory, asum_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_double"); - test_asum(); + test_asum(); Kokkos::Profiling::popRegion(); } #endif @@ -118,7 +118,7 @@ TEST_F(TestCategory, asum_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_complex_double"); - test_asum, TestExecSpace>(); + test_asum, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -128,7 +128,7 @@ TEST_F(TestCategory, asum_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, asum_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::asum_int"); - 
test_asum(); + test_asum(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index 0d34464a84..8d5afb5f0b 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -227,12 +227,12 @@ int test_axpby_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_float"); - test_axpby(); + test_axpby(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_float"); - test_axpby_mv(); + test_axpby_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -242,11 +242,11 @@ TEST_F(TestCategory, axpby_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double"); - test_axpby(); + test_axpby(); } TEST_F(TestCategory, axpby_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_double"); - test_axpby_mv(); + test_axpby_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -256,13 +256,12 @@ TEST_F(TestCategory, axpby_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_complex_double"); - test_axpby, Kokkos::complex, TestExecSpace>(); + test_axpby, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_complex_double"); - test_axpby_mv, Kokkos::complex, - TestExecSpace>(); + test_axpby_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -272,12 +271,12 @@ TEST_F(TestCategory, axpby_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpby_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_int"); - test_axpby(); + 
test_axpby(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_int"); - test_axpby_mv(); + test_axpby_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -286,12 +285,12 @@ TEST_F(TestCategory, axpby_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpby_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_double_int"); - test_axpby(); + test_axpby(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpby_double_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_double_int"); - test_axpby_mv(); + test_axpby_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 8b21ff6dc5..76528f4a52 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -224,12 +224,12 @@ int test_axpy_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_float"); - test_axpy(); + test_axpy(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_float"); - test_axpy_mv(); + test_axpy_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -239,12 +239,12 @@ TEST_F(TestCategory, axpy_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double"); - test_axpy(); + test_axpy(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_double"); - test_axpy_mv(); + test_axpy_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -254,13 +254,12 @@ TEST_F(TestCategory, axpy_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_complex_double) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_complex_double"); - test_axpy, Kokkos::complex, TestExecSpace>(); + test_axpy, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_complex_double"); - test_axpy_mv, Kokkos::complex, - TestExecSpace>(); + test_axpy_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -270,12 +269,12 @@ TEST_F(TestCategory, axpy_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, axpy_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_int"); - test_axpy(); + test_axpy(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_int"); - test_axpy_mv(); + test_axpy_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -284,12 +283,12 @@ TEST_F(TestCategory, axpy_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, axpy_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_double_int"); - test_axpy(); + test_axpy(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, axpy_double_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpy_mv_double_int"); - test_axpy_mv(); + test_axpy_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_dot.hpp b/blas/unit_test/Test_Blas1_dot.hpp index d978cbafaa..911925476a 100644 --- a/blas/unit_test/Test_Blas1_dot.hpp +++ b/blas/unit_test/Test_Blas1_dot.hpp @@ -235,12 +235,12 @@ int test_dot_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_float"); - test_dot(); + test_dot(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, dot_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_float"); - test_dot_mv(); + test_dot_mv(); Kokkos::Profiling::popRegion(); } 
#endif @@ -250,12 +250,12 @@ TEST_F(TestCategory, dot_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_double"); - test_dot(); + test_dot(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, dot_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_double"); - test_dot_mv(); + test_dot_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -265,13 +265,12 @@ TEST_F(TestCategory, dot_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_complex_double"); - test_dot, Kokkos::complex, TestExecSpace>(); + test_dot, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, dot_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_complex_double"); - test_dot_mv, Kokkos::complex, - TestExecSpace>(); + test_dot_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -281,21 +280,21 @@ TEST_F(TestCategory, dot_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, dot_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_int"); - test_dot(); + test_dot(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, dot_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::dot_mv_int"); - test_dot_mv(); + test_dot_mv(); Kokkos::Profiling::popRegion(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -dot_double_int ) { test_dot (); +dot_double_int ) { test_dot (); } TEST_F( TestCategory, dot_mv_double_int ) { - test_dot_mv (); + test_dot_mv (); } #endif*/ diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index fcd896e22a..49f759958a 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -280,12 +280,12 @@ int 
test_iamax_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_float"); - test_iamax(); + test_iamax(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, iamax_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_mvfloat"); - test_iamax_mv(); + test_iamax_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -295,12 +295,12 @@ TEST_F(TestCategory, iamax_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_double"); - test_iamax(); + test_iamax(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, iamax_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_mv_double"); - test_iamax_mv(); + test_iamax_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -310,12 +310,12 @@ TEST_F(TestCategory, iamax_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_complex_double"); - test_iamax, TestExecSpace>(); + test_iamax, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, iamax_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_mv_complex_double"); - test_iamax_mv, TestExecSpace>(); + test_iamax_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -325,12 +325,12 @@ TEST_F(TestCategory, iamax_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, iamax_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_int"); - test_iamax(); + test_iamax(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, iamax_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::iamax_mv_int"); - test_iamax_mv(); + test_iamax_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_mult.hpp b/blas/unit_test/Test_Blas1_mult.hpp index 
0888c7a6b2..6555280f0d 100644 --- a/blas/unit_test/Test_Blas1_mult.hpp +++ b/blas/unit_test/Test_Blas1_mult.hpp @@ -275,12 +275,12 @@ int test_mult_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_float"); - test_mult(); + test_mult(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_float"); - test_mult_mv(); + test_mult_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -290,12 +290,12 @@ TEST_F(TestCategory, mult_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double"); - test_mult(); + test_mult(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_double"); - test_mult_mv(); + test_mult_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -306,13 +306,13 @@ TEST_F(TestCategory, mult_mv_double) { TEST_F(TestCategory, mult_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_complex_double"); test_mult, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_complex_double"); test_mult_mv, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -322,12 +322,12 @@ TEST_F(TestCategory, mult_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, mult_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_int"); - test_mult(); + test_mult(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_int"); - test_mult_mv(); + test_mult_mv(); 
Kokkos::Profiling::popRegion(); } #endif @@ -336,12 +336,12 @@ TEST_F(TestCategory, mult_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, mult_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_double_int"); - test_mult(); + test_mult(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, mult_mv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::mult_mv_double_int"); - test_mult_mv(); + test_mult_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index 5c99895a49..f6938c5147 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -182,12 +182,12 @@ int test_nrm1_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_float"); - test_nrm1(); + test_nrm1(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm1_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_float"); - test_nrm1_mv(); + test_nrm1_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -197,12 +197,12 @@ TEST_F(TestCategory, nrm1_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_double"); - test_nrm1(); + test_nrm1(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm1_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_double"); - test_nrm1_mv(); + test_nrm1_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -212,12 +212,12 @@ TEST_F(TestCategory, nrm1_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_complex_double"); - test_nrm1, TestExecSpace>(); + test_nrm1, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm1_mv_complex_double) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_complex_double"); - test_nrm1_mv, TestExecSpace>(); + test_nrm1_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -227,12 +227,12 @@ TEST_F(TestCategory, nrm1_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm1_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_int"); - test_nrm1(); + test_nrm1(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm1_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm1_mv_int"); - test_nrm1_mv(); + test_nrm1_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index 1264cfecf2..a9b3f7c10f 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -177,12 +177,12 @@ int test_nrm2_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_float"); - test_nrm2(); + test_nrm2(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_float"); - test_nrm2_mv(); + test_nrm2_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -192,12 +192,12 @@ TEST_F(TestCategory, nrm2_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_double"); - test_nrm2(); + test_nrm2(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_double"); - test_nrm2_mv(); + test_nrm2_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -207,12 +207,12 @@ TEST_F(TestCategory, nrm2_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_complex_double"); - test_nrm2, TestExecSpace>(); + test_nrm2, 
TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_complex_double"); - test_nrm2_mv, TestExecSpace>(); + test_nrm2_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -222,12 +222,12 @@ TEST_F(TestCategory, nrm2_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_int"); - test_nrm2(); + test_nrm2(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_mv_int"); - test_nrm2_mv(); + test_nrm2_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm2_squared.hpp b/blas/unit_test/Test_Blas1_nrm2_squared.hpp index c218a12d39..09e4b3d45d 100644 --- a/blas/unit_test/Test_Blas1_nrm2_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2_squared.hpp @@ -182,12 +182,12 @@ int test_nrm2_squared_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_float"); - test_nrm2_squared(); + test_nrm2_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_float"); - test_nrm2_squared_mv(); + test_nrm2_squared_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -197,12 +197,12 @@ TEST_F(TestCategory, nrm2_squared_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_double"); - test_nrm2_squared(); + test_nrm2_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_double"); - test_nrm2_squared_mv(); + test_nrm2_squared_mv(); Kokkos::Profiling::popRegion(); } #endif @@ 
-213,13 +213,13 @@ TEST_F(TestCategory, nrm2_squared_mv_double) { TEST_F(TestCategory, nrm2_squared_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::nrm2_squared_complex_double"); - test_nrm2_squared, TestExecSpace>(); + test_nrm2_squared, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::nrm2_squared_mv_complex_double"); - test_nrm2_squared_mv, TestExecSpace>(); + test_nrm2_squared_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -229,12 +229,12 @@ TEST_F(TestCategory, nrm2_squared_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2_squared_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_int"); - test_nrm2_squared(); + test_nrm2_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2_squared_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2_squared_mv_int"); - test_nrm2_squared_mv(); + test_nrm2_squared_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index 89c1bdad45..48d8676fe4 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -185,12 +185,12 @@ int test_nrm2w_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_float"); - test_nrm2w(); + test_nrm2w(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_float"); - test_nrm2w_mv(); + test_nrm2w_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -200,12 +200,12 @@ TEST_F(TestCategory, nrm2w_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_double"); - test_nrm2w(); + test_nrm2w(); 
Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_double"); - test_nrm2w_mv(); + test_nrm2w_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -215,12 +215,12 @@ TEST_F(TestCategory, nrm2w_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_complex_double"); - test_nrm2w, TestExecSpace>(); + test_nrm2w, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_complex_double"); - test_nrm2w_mv, TestExecSpace>(); + test_nrm2w_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -230,12 +230,12 @@ TEST_F(TestCategory, nrm2w_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_int"); - test_nrm2w(); + test_nrm2w(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_int"); - test_nrm2w_mv(); + test_nrm2w_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index bacc733b1a..5a55d15fad 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -179,12 +179,12 @@ int test_nrm2w_squared_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_float"); - test_nrm2w_squared(); + test_nrm2w_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_float"); - test_nrm2w_squared_mv(); + test_nrm2w_squared_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -194,12 
+194,12 @@ TEST_F(TestCategory, nrm2w_squared_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_double"); - test_nrm2w_squared(); + test_nrm2w_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_double"); - test_nrm2w_squared_mv(); + test_nrm2w_squared_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -210,13 +210,13 @@ TEST_F(TestCategory, nrm2w_squared_mv_double) { TEST_F(TestCategory, nrm2w_squared_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::nrm2w_squared_complex_double"); - test_nrm2w_squared, TestExecSpace>(); + test_nrm2w_squared, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::nrm2w_squared_mv_complex_double"); - test_nrm2w_squared_mv, TestExecSpace>(); + test_nrm2w_squared_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -226,12 +226,12 @@ TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrm2w_squared_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_int"); - test_nrm2w_squared(); + test_nrm2w_squared(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrm2w_squared_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_int"); - test_nrm2w_squared_mv(); + test_nrm2w_squared_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index 438db16895..91cc1c7502 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -173,12 +173,12 @@ int test_nrminf_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_float) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_float"); - test_nrminf(); + test_nrminf(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrminf_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mvfloat"); - test_nrminf_mv(); + test_nrminf_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -188,12 +188,12 @@ TEST_F(TestCategory, nrminf_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_double"); - test_nrminf(); + test_nrminf(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrminf_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_double"); - test_nrminf_mv(); + test_nrminf_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -203,12 +203,12 @@ TEST_F(TestCategory, nrminf_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_complex_double"); - test_nrminf, TestExecSpace>(); + test_nrminf, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrminf_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_complex_double"); - test_nrminf_mv, TestExecSpace>(); + test_nrminf_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -218,12 +218,12 @@ TEST_F(TestCategory, nrminf_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, nrminf_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_int"); - test_nrminf(); + test_nrminf(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, nrminf_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrminf_mv_int"); - test_nrminf_mv(); + test_nrminf_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index 841725e6fd..c293fa04eb 100644 --- 
a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -210,12 +210,12 @@ int test_reciprocal_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_float"); - test_reciprocal(); + test_reciprocal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_float"); - test_reciprocal_mv(); + test_reciprocal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -225,12 +225,12 @@ TEST_F(TestCategory, reciprocal_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_double"); - test_reciprocal(); + test_reciprocal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_double"); - test_reciprocal_mv(); + test_reciprocal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -241,14 +241,14 @@ TEST_F(TestCategory, reciprocal_mv_double) { TEST_F(TestCategory, reciprocal_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_complex_double"); test_reciprocal, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_complex_double) { Kokkos::Profiling::pushRegion( "KokkosBlas::Test::reciprocal_mv_complex_double"); test_reciprocal_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -258,12 +258,12 @@ TEST_F(TestCategory, reciprocal_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, reciprocal_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_int"); - test_reciprocal(); + test_reciprocal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, reciprocal_mv_int) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::reciprocal_mv_int"); - test_reciprocal_mv(); + test_reciprocal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -271,10 +271,10 @@ TEST_F(TestCategory, reciprocal_mv_int) { /* #if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -reciprocal_double_int ) { test_reciprocal (); +reciprocal_double_int ) { test_reciprocal (); } TEST_F( TestCategory, reciprocal_double_mv_int ) { - test_reciprocal_mv (); + test_reciprocal_mv (); } #endif */ diff --git a/blas/unit_test/Test_Blas1_rot.hpp b/blas/unit_test/Test_Blas1_rot.hpp index 7fe079d1aa..ab1f395923 100644 --- a/blas/unit_test/Test_Blas1_rot.hpp +++ b/blas/unit_test/Test_Blas1_rot.hpp @@ -75,7 +75,7 @@ int test_rot() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); - test_rot(); + test_rot(); Kokkos::Profiling::popRegion(); } #endif @@ -85,7 +85,7 @@ TEST_F(TestCategory, rot_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); - test_rot(); + test_rot(); Kokkos::Profiling::popRegion(); } #endif @@ -95,7 +95,7 @@ TEST_F(TestCategory, rot_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); - test_rot, TestExecSpace>(); + test_rot, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -105,7 +105,7 @@ TEST_F(TestCategory, rot_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rot_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rot"); - test_rot, TestExecSpace>(); + test_rot, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_rotg.hpp b/blas/unit_test/Test_Blas1_rotg.hpp index 338eaa1b56..27f9c3cf71 100644 --- 
a/blas/unit_test/Test_Blas1_rotg.hpp +++ b/blas/unit_test/Test_Blas1_rotg.hpp @@ -16,12 +16,12 @@ #include namespace Test { -template -void test_rotg_impl(ExecSpace const& space, Scalar const a_in, - Scalar const b_in) { +template +void test_rotg_impl(typename Device::execution_space const& space, + Scalar const a_in, Scalar const b_in) { using magnitude_type = typename Kokkos::ArithTraits::mag_type; - using SViewType = Kokkos::View; - using MViewType = Kokkos::View; + using SViewType = Kokkos::View; + using MViewType = Kokkos::View; // const magnitude_type eps = Kokkos::ArithTraits::eps(); // const Scalar zero = Kokkos::ArithTraits::zero(); @@ -43,17 +43,17 @@ void test_rotg_impl(ExecSpace const& space, Scalar const a_in, } } // namespace Test -template +template int test_rotg() { const Scalar zero = Kokkos::ArithTraits::zero(); const Scalar one = Kokkos::ArithTraits::one(); const Scalar two = one + one; - ExecutionSpace space{}; + typename Device::execution_space space{}; - Test::test_rotg_impl(space, one, zero); - Test::test_rotg_impl(space, one / two, one / two); - Test::test_rotg_impl(space, 2.1 * one, 1.3 * one); + Test::test_rotg_impl(space, one, zero); + Test::test_rotg_impl(space, one / two, one / two); + Test::test_rotg_impl(space, 2.1 * one, 1.3 * one); return 1; } @@ -63,7 +63,7 @@ int test_rotg() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); - test_rotg(); + test_rotg(); Kokkos::Profiling::popRegion(); } #endif @@ -73,7 +73,7 @@ TEST_F(TestCategory, rotg_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); - test_rotg(); + test_rotg(); Kokkos::Profiling::popRegion(); } #endif @@ -83,7 +83,7 @@ TEST_F(TestCategory, rotg_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_complex_float) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); - test_rotg, TestExecSpace>(); + test_rotg, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -93,7 +93,7 @@ TEST_F(TestCategory, rotg_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotg_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotg"); - test_rotg, TestExecSpace>(); + test_rotg, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_rotm.hpp b/blas/unit_test/Test_Blas1_rotm.hpp index c9a09fd915..1f41fd06bc 100644 --- a/blas/unit_test/Test_Blas1_rotm.hpp +++ b/blas/unit_test/Test_Blas1_rotm.hpp @@ -166,7 +166,7 @@ int test_rotm() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotm"); - test_rotm(); + test_rotm(); Kokkos::Profiling::popRegion(); } #endif @@ -176,7 +176,7 @@ TEST_F(TestCategory, rotm_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, rotm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::rotm"); - test_rotm(); + test_rotm(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_rotmg.hpp b/blas/unit_test/Test_Blas1_rotmg.hpp index f628505d97..ecfc3b6815 100644 --- a/blas/unit_test/Test_Blas1_rotmg.hpp +++ b/blas/unit_test/Test_Blas1_rotmg.hpp @@ -218,14 +218,10 @@ void set_rotmg_input_ref_vals(const int test_case, View0& d1, View0& d2, } } // namespace Test -template +template int test_rotmg() { - Kokkos::View> - d1("d1"), d2("d2"), x1("x1"), y1("y1"); - Kokkos::View> - param("param"); + Kokkos::View d1("d1"), d2("d2"), x1("x1"), y1("y1"); + Kokkos::View param("param"); Kokkos::View ref_vals( "reference values"); diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index 6c4f7b7f2a..a88ed646f1 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -229,12 +229,12 @@ int 
test_scal_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_float"); - test_scal(); + test_scal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_float"); - test_scal_mv(); + test_scal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -244,12 +244,12 @@ TEST_F(TestCategory, scal_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double"); - test_scal(); + test_scal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_double"); - test_scal_mv(); + test_scal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -259,13 +259,12 @@ TEST_F(TestCategory, scal_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_complex_double"); - test_scal, Kokkos::complex, TestExecSpace>(); + test_scal, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_complex_double"); - test_scal_mv, Kokkos::complex, - TestExecSpace>(); + test_scal_mv, Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -275,12 +274,12 @@ TEST_F(TestCategory, scal_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, scal_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_int"); - test_scal(); + test_scal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_int"); - test_scal_mv(); + test_scal_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -289,12 +288,12 @@ TEST_F(TestCategory, scal_mv_int) { 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, scal_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_double_int"); - test_scal(); + test_scal(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, scal_mv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::scal_mv_double_int"); - test_scal_mv(); + test_scal_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_serial_setscal.hpp b/blas/unit_test/Test_Blas1_serial_setscal.hpp index 80a0561d60..cfbe4d602d 100644 --- a/blas/unit_test/Test_Blas1_serial_setscal.hpp +++ b/blas/unit_test/Test_Blas1_serial_setscal.hpp @@ -87,7 +87,8 @@ struct Functor_TestBlasSerialMatUtil { std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::RangePolicy + policy(0, _a.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return 0; @@ -180,19 +181,19 @@ int test_blas_matutil() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, blas_scalar_serial_set_float_float) { - test_blas_matutil(); + test_blas_matutil(); } TEST_F(TestCategory, blas_scalar_serial_scale_float_float) { - test_blas_matutil(); + test_blas_matutil(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, blas_scalar_serial_set_double_double) { - test_blas_matutil(); + test_blas_matutil(); } TEST_F(TestCategory, blas_scalar_serial_scale_double_double) { - test_blas_matutil(); + test_blas_matutil(); } #endif @@ -200,19 +201,19 @@ TEST_F(TestCategory, blas_scalar_serial_scale_double_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_dcomplex) { - test_blas_matutil, + test_blas_matutil, Kokkos::complex, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_dcomplex) { - test_blas_matutil, + test_blas_matutil, 
Kokkos::complex, ::Test::BlasScale>(); } TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_double) { - test_blas_matutil, double, + test_blas_matutil, double, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_double) { - test_blas_matutil, double, + test_blas_matutil, double, ::Test::BlasScale>(); } #endif diff --git a/blas/unit_test/Test_Blas1_sum.hpp b/blas/unit_test/Test_Blas1_sum.hpp index cf119cbd00..34d52a7e4a 100644 --- a/blas/unit_test/Test_Blas1_sum.hpp +++ b/blas/unit_test/Test_Blas1_sum.hpp @@ -167,12 +167,12 @@ int test_sum_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_float"); - test_sum(); + test_sum(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, sum_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_float"); - test_sum_mv(); + test_sum_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -182,12 +182,12 @@ TEST_F(TestCategory, sum_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_double"); - test_sum(); + test_sum(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, sum_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_double"); - test_sum_mv(); + test_sum_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -197,12 +197,12 @@ TEST_F(TestCategory, sum_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_complex_double"); - test_sum, TestExecSpace>(); + test_sum, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, sum_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_complex_double"); - test_sum_mv, TestExecSpace>(); + test_sum_mv, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -212,12 +212,12 @@ TEST_F(TestCategory, 
sum_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, sum_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_int"); - test_sum(); + test_sum(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, sum_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::sum_mv_int"); - test_sum_mv(); + test_sum_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_swap.hpp b/blas/unit_test/Test_Blas1_swap.hpp index a7e4fff433..382c35947b 100644 --- a/blas/unit_test/Test_Blas1_swap.hpp +++ b/blas/unit_test/Test_Blas1_swap.hpp @@ -60,7 +60,7 @@ int test_swap() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_float"); - test_swap(); + test_swap(); Kokkos::Profiling::popRegion(); } #endif @@ -70,7 +70,7 @@ TEST_F(TestCategory, swap_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_double"); - test_swap(); + test_swap(); Kokkos::Profiling::popRegion(); } #endif @@ -80,7 +80,7 @@ TEST_F(TestCategory, swap_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_complex_float"); - test_swap, TestExecSpace>(); + test_swap, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif @@ -90,7 +90,7 @@ TEST_F(TestCategory, swap_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, swap_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::swap_complex_double"); - test_swap, TestExecSpace>(); + test_swap, TestDevice>(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_abs.hpp b/blas/unit_test/Test_Blas1_team_abs.hpp index d3f4f661d0..eca7657b55 100644 --- a/blas/unit_test/Test_Blas1_team_abs.hpp +++ b/blas/unit_test/Test_Blas1_team_abs.hpp @@ -29,7 +29,8 @@ 
namespace Test { template void impl_test_team_abs(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -109,7 +110,8 @@ void impl_test_team_abs(int N) { template void impl_test_team_abs_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -122,8 +124,7 @@ void impl_test_team_abs_mv(int N, int K) { view_stride_adapter x("X", N, K); view_stride_adapter y("Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(1)); @@ -288,10 +289,10 @@ int test_team_abs_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_float) { - test_team_abs(); + test_team_abs(); } TEST_F(TestCategory, team_abs_mv_float) { - test_team_abs_mv(); + test_team_abs_mv(); } #endif @@ -299,10 +300,10 @@ TEST_F(TestCategory, team_abs_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_double) { - test_team_abs(); + test_team_abs(); } TEST_F(TestCategory, team_abs_mv_double) { - test_team_abs_mv(); + test_team_abs_mv(); } #endif @@ -310,30 +311,29 @@ TEST_F(TestCategory, team_abs_mv_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_abs_complex_double) { - test_team_abs, Kokkos::complex, - TestExecSpace>(); + test_team_abs, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, 
team_abs_mv_complex_double) { test_team_abs_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_abs_int) { test_team_abs(); } +TEST_F(TestCategory, team_abs_int) { test_team_abs(); } TEST_F(TestCategory, team_abs_mv_int) { - test_team_abs_mv(); + test_team_abs_mv(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -team_abs_double_int ) { test_team_abs (); +team_abs_double_int ) { test_team_abs (); } TEST_F( TestCategory, team_abs_double_mv_int ) { - test_team_abs_mv (); + test_team_abs_mv (); } #endif*/ diff --git a/blas/unit_test/Test_Blas1_team_axpby.hpp b/blas/unit_test/Test_Blas1_team_axpby.hpp index e11b1e14a5..5875f2bc1f 100644 --- a/blas/unit_test/Test_Blas1_team_axpby.hpp +++ b/blas/unit_test/Test_Blas1_team_axpby.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_axpby(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -48,8 +49,7 @@ void impl_test_team_axpby(int N) { view_stride_adapter y("Y", N); view_stride_adapter org_y("Y", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -116,7 +116,8 @@ void impl_test_team_axpby(int N) { template void impl_test_team_axpby_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of 
threads per team @@ -129,8 +130,7 @@ void impl_test_team_axpby_mv(int N, int K) { view_stride_adapter y("Y", N, K); view_stride_adapter org_y("Org_Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -291,10 +291,10 @@ int test_team_axpby_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_float) { - test_team_axpby(); + test_team_axpby(); } TEST_F(TestCategory, team_axpby_mv_float) { - test_team_axpby_mv(); + test_team_axpby_mv(); } #endif @@ -302,10 +302,10 @@ TEST_F(TestCategory, team_axpby_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_double) { - test_team_axpby(); + test_team_axpby(); } TEST_F(TestCategory, team_axpby_mv_double) { - test_team_axpby_mv(); + test_team_axpby_mv(); } #endif @@ -314,11 +314,11 @@ TEST_F(TestCategory, team_axpby_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_complex_double) { test_team_axpby, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } TEST_F(TestCategory, team_axpby_mv_complex_double) { test_team_axpby_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif @@ -326,20 +326,20 @@ TEST_F(TestCategory, team_axpby_mv_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpby_int) { - test_team_axpby(); + test_team_axpby(); } TEST_F(TestCategory, team_axpby_mv_int) { - test_team_axpby_mv(); + test_team_axpby_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_axpby_double_int) { - test_team_axpby(); + test_team_axpby(); } TEST_F(TestCategory, team_axpby_double_mv_int) { - test_team_axpby_mv(); + 
test_team_axpby_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_axpy.hpp b/blas/unit_test/Test_Blas1_team_axpy.hpp index 5cff9d025e..a5ac6a9c66 100644 --- a/blas/unit_test/Test_Blas1_team_axpy.hpp +++ b/blas/unit_test/Test_Blas1_team_axpy.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_axpy(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -113,7 +114,8 @@ void impl_test_team_axpy(int N) { template void impl_test_team_axpy_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -126,8 +128,7 @@ void impl_test_team_axpy_mv(int N, int K) { view_stride_adapter y("Y", N, K); view_stride_adapter org_y("Org_Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -284,10 +285,10 @@ int test_team_axpy_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_float) { - test_team_axpy(); + test_team_axpy(); } TEST_F(TestCategory, team_axpy_mv_float) { - test_team_axpy_mv(); + test_team_axpy_mv(); } #endif @@ -295,10 +296,10 @@ TEST_F(TestCategory, team_axpy_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_double) { - test_team_axpy(); + test_team_axpy(); } TEST_F(TestCategory, team_axpy_mv_double) { - test_team_axpy_mv(); + test_team_axpy_mv(); } #endif @@ -307,32 +308,30 @@ TEST_F(TestCategory, 
team_axpy_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_axpy_complex_double) { test_team_axpy, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } TEST_F(TestCategory, team_axpy_mv_complex_double) { test_team_axpy_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_axpy_int) { - test_team_axpy(); -} +TEST_F(TestCategory, team_axpy_int) { test_team_axpy(); } TEST_F(TestCategory, team_axpy_mv_int) { - test_team_axpy_mv(); + test_team_axpy_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_axpy_double_int) { - test_team_axpy(); + test_team_axpy(); } TEST_F(TestCategory, team_axpy_double_mv_int) { - test_team_axpy_mv(); + test_team_axpy_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_dot.hpp b/blas/unit_test/Test_Blas1_team_dot.hpp index 00c0940023..26baf261fe 100644 --- a/blas/unit_test/Test_Blas1_team_dot.hpp +++ b/blas/unit_test/Test_Blas1_team_dot.hpp @@ -28,7 +28,8 @@ namespace Test { template void impl_test_team_dot(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -42,8 +43,7 @@ void impl_test_team_dot(int N) { view_stride_adapter a("a", N); view_stride_adapter b("b", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(b.d_view, rand_pool, ScalarB(10)); @@ -161,7 +161,8 @@ void impl_test_team_dot(int N) { template void impl_test_team_dot_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = 
typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -173,8 +174,7 @@ void impl_test_team_dot_mv(int N, int K) { view_stride_adapter a("A", N, K); view_stride_adapter b("B", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(b.d_view, rand_pool, ScalarB(10)); @@ -355,10 +355,10 @@ int test_team_dot_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_float) { - test_team_dot(); + test_team_dot(); } TEST_F(TestCategory, team_dot_mv_float) { - test_team_dot_mv(); + test_team_dot_mv(); } #endif @@ -366,10 +366,10 @@ TEST_F(TestCategory, team_dot_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_double) { - test_team_dot(); + test_team_dot(); } TEST_F(TestCategory, team_dot_mv_double) { - test_team_dot_mv(); + test_team_dot_mv(); } #endif @@ -377,30 +377,29 @@ TEST_F(TestCategory, team_dot_mv_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_dot_complex_double) { - test_team_dot, Kokkos::complex, - TestExecSpace>(); + test_team_dot, Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_dot_mv_complex_double) { test_team_dot_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_dot_int) { test_team_dot(); } +TEST_F(TestCategory, team_dot_int) { test_team_dot(); } TEST_F(TestCategory, team_dot_mv_int) { - test_team_dot_mv(); + test_team_dot_mv(); } #endif /*#if !defined(KOKKOSKERNELS_ETI_ONLY) && 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F( TestCategory, -team_dot_double_int ) { test_team_dot (); +team_dot_double_int ) { test_team_dot (); } TEST_F( TestCategory, team_dot_mv_double_int ) { - test_team_dot_mv (); + test_team_dot_mv (); } #endif*/ diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index f340ac2309..488e9ccf51 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_mult(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -50,8 +51,7 @@ void impl_test_team_mult(int N) { view_stride_adapter z("Z", N); view_stride_adapter org_z("Org_Z", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -158,7 +158,8 @@ void impl_test_team_mult(int N) { template void impl_test_team_mult_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -174,8 +175,7 @@ void impl_test_team_mult_mv(int N, int K) { view_stride_adapter z("Z", N, K); view_stride_adapter org_z("Org_Z", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); typename Kokkos::ArithTraits::mag_type const max_val = 10; Kokkos::fill_random(x.d_view, rand_pool, ScalarA(max_val)); @@ -366,10 +366,10 @@ int test_team_mult_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_float) { - test_team_mult(); + test_team_mult(); } TEST_F(TestCategory, team_mult_mv_float) { - test_team_mult_mv(); + test_team_mult_mv(); } #endif @@ -377,10 +377,10 @@ TEST_F(TestCategory, team_mult_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_double) { - test_team_mult(); + test_team_mult(); } TEST_F(TestCategory, team_mult_mv_double) { - test_team_mult_mv(); + test_team_mult_mv(); } #endif @@ -389,11 +389,11 @@ TEST_F(TestCategory, team_mult_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_complex_double) { test_team_mult, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_mult_mv_complex_double) { test_team_mult_mv, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); } #endif @@ -401,20 +401,20 @@ TEST_F(TestCategory, team_mult_mv_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_mult_int) { - test_team_mult(); + test_team_mult(); } TEST_F(TestCategory, team_mult_mv_int) { - test_team_mult_mv(); + test_team_mult_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_mult_double_int) { - test_team_mult(); + test_team_mult(); } TEST_F(TestCategory, team_mult_double_mv_int) { - test_team_mult_mv(); + test_team_mult_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_nrm2.hpp b/blas/unit_test/Test_Blas1_team_nrm2.hpp index 4bc4836782..12192032c9 100644 --- a/blas/unit_test/Test_Blas1_team_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_team_nrm2.hpp @@ -28,7 +28,8 @@ namespace Test { template void impl_test_team_nrm2(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename 
Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -39,8 +40,7 @@ void impl_test_team_nrm2(int N, int K) { view_stride_adapter a("A", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); @@ -130,31 +130,27 @@ int test_team_nrm2() { #if defined(KOKKOSKERNELS_INST_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_nrm2_float) { - test_team_nrm2(); -} +TEST_F(TestCategory, team_nrm2_float) { test_team_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_nrm2_double) { - test_team_nrm2(); -} +TEST_F(TestCategory, team_nrm2_double) { test_team_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_nrm2_complex_double) { - test_team_nrm2, TestExecSpace>(); + test_team_nrm2, TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_nrm2_int) { test_team_nrm2(); } +TEST_F(TestCategory, team_nrm2_int) { test_team_nrm2(); } #endif #endif // Check for lambda availability in CUDA backend diff --git a/blas/unit_test/Test_Blas1_team_scal.hpp b/blas/unit_test/Test_Blas1_team_scal.hpp index e0c109e1af..212b1e09e9 100644 --- a/blas/unit_test/Test_Blas1_team_scal.hpp +++ b/blas/unit_test/Test_Blas1_team_scal.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_scal(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef 
Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -49,8 +50,7 @@ void impl_test_team_scal(int N) { typename AT::mag_type zero = AT::abs(AT::zero()); typename AT::mag_type one = AT::abs(AT::one()); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); @@ -122,7 +122,8 @@ void impl_test_team_scal(int N) { template void impl_test_team_scal_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -135,8 +136,7 @@ void impl_test_team_scal_mv(int N, int K) { view_stride_adapter x("X", N, K); view_stride_adapter y("Y", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); Kokkos::deep_copy(x.h_base, x.d_base); @@ -358,10 +358,10 @@ int test_team_scal_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_float) { - test_team_scal(); + test_team_scal(); } TEST_F(TestCategory, team_scal_mv_float) { - test_team_scal_mv(); + test_team_scal_mv(); } #endif @@ -369,10 +369,10 @@ TEST_F(TestCategory, team_scal_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_double) { - test_team_scal(); + test_team_scal(); } TEST_F(TestCategory, team_scal_mv_double) { - test_team_scal_mv(); + test_team_scal_mv(); } #endif @@ -381,32 +381,30 @@ TEST_F(TestCategory, team_scal_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_scal_complex_double) { test_team_scal, Kokkos::complex, - 
TestExecSpace>(); + TestDevice>(); } TEST_F(TestCategory, team_scal_mv_complex_double) { test_team_scal_mv, Kokkos::complex, - TestExecSpace>(); + TestDevice>(); } #endif #if defined(KOKKOSKERNELS_INST_INT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_scal_int) { - test_team_scal(); -} +TEST_F(TestCategory, team_scal_int) { test_team_scal(); } TEST_F(TestCategory, team_scal_mv_int) { - test_team_scal_mv(); + test_team_scal_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_scal_double_int) { - test_team_scal(); + test_team_scal(); } TEST_F(TestCategory, team_scal_double_mv_int) { - test_team_scal_mv(); + test_team_scal_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_setscal.hpp b/blas/unit_test/Test_Blas1_team_setscal.hpp index ff593d3eeb..4d2499a466 100644 --- a/blas/unit_test/Test_Blas1_team_setscal.hpp +++ b/blas/unit_test/Test_Blas1_team_setscal.hpp @@ -36,6 +36,7 @@ struct NaiveTag {}; template struct Functor_TestBlasTeamMatUtil { + using execution_space = typename DeviceType::execution_space; ScalarType _alpha; ViewType _a; @@ -97,8 +98,8 @@ struct Functor_TestBlasTeamMatUtil { Kokkos::Profiling::pushRegion(name.c_str()); const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); @@ -193,19 +194,19 @@ int test_blas_team_matutil() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, blas_scalar_team_set_float_float) { - test_blas_team_matutil(); + test_blas_team_matutil(); } TEST_F(TestCategory, blas_scalar_team_scale_float_float) { - test_blas_team_matutil(); + test_blas_team_matutil(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, blas_scalar_team_set_double_double) { - test_blas_team_matutil(); 
+ test_blas_team_matutil(); } TEST_F(TestCategory, blas_scalar_team_scale_double_double) { - test_blas_team_matutil(); + test_blas_team_matutil(); } #endif @@ -213,19 +214,19 @@ TEST_F(TestCategory, blas_scalar_team_scale_double_double) { #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, blas_scalar_team_set_dcomplex_dcomplex) { - test_blas_team_matutil, + test_blas_team_matutil, Kokkos::complex, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_dcomplex) { - test_blas_team_matutil, + test_blas_team_matutil, Kokkos::complex, ::Test::BlasScale>(); } TEST_F(TestCategory, blas_scalar_team_set_dcomplex_double) { - test_blas_team_matutil, double, + test_blas_team_matutil, double, ::Test::BlasSet>(); } TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_double) { - test_blas_team_matutil, double, + test_blas_team_matutil, double, ::Test::BlasScale>(); } #endif diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index 09b60440ae..cfc76455f3 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -29,7 +29,8 @@ namespace Test { template void impl_test_team_update(int N) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch M teams of the maximum number of threads per team @@ -51,8 +52,7 @@ void impl_test_team_update(int N) { view_stride_adapter z("Z", N); view_stride_adapter org_z("Org_Z", N); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -160,7 +160,8 @@ void impl_test_team_update(int N) { template void impl_test_team_update_mv(int N, int K) { - typedef Kokkos::TeamPolicy team_policy; + using execution_space = 
typename Device::execution_space; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; // Launch K teams of the maximum number of threads per team @@ -175,8 +176,7 @@ void impl_test_team_update_mv(int N, int K) { view_stride_adapter z("Z", N, K); view_stride_adapter org_z("Org_Z", N, K); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); @@ -370,10 +370,10 @@ int test_team_update_mv() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_float) { - test_team_update(); + test_team_update(); } TEST_F(TestCategory, team_update_mv_float) { - test_team_update_mv(); + test_team_update_mv(); } #endif @@ -381,10 +381,10 @@ TEST_F(TestCategory, team_update_mv_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_double) { - test_team_update(); + test_team_update(); } TEST_F(TestCategory, team_update_mv_double) { - test_team_update_mv(); + test_team_update_mv(); } #endif @@ -393,11 +393,11 @@ TEST_F(TestCategory, team_update_mv_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_complex_double) { test_team_update, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); } TEST_F(TestCategory, team_update_mv_complex_double) { test_team_update_mv, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); } #endif @@ -405,20 +405,20 @@ TEST_F(TestCategory, team_update_mv_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, team_update_int) { - test_team_update(); + test_team_update(); } TEST_F(TestCategory, team_update_mv_int) { - test_team_update_mv(); + 
test_team_update_mv(); } #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, team_update_double_int) { - test_team_update(); + test_team_update(); } TEST_F(TestCategory, team_update_double_mv_int) { - test_team_update_mv(); + test_team_update_mv(); } #endif diff --git a/blas/unit_test/Test_Blas1_update.hpp b/blas/unit_test/Test_Blas1_update.hpp index 07445f595e..cfeddb9d3d 100644 --- a/blas/unit_test/Test_Blas1_update.hpp +++ b/blas/unit_test/Test_Blas1_update.hpp @@ -279,12 +279,12 @@ int test_update_mv() { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_float"); - test_update(); + test_update(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_float"); - test_update_mv(); + test_update_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -294,11 +294,11 @@ TEST_F(TestCategory, update_mv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double"); - test_update(); + test_update(); } TEST_F(TestCategory, update_mv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_double"); - test_update_mv(); + test_update_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -309,13 +309,13 @@ TEST_F(TestCategory, update_mv_double) { TEST_F(TestCategory, update_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_complex_double"); test_update, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_complex_double"); test_update_mv, Kokkos::complex, - Kokkos::complex, TestExecSpace>(); + Kokkos::complex, TestDevice>(); 
Kokkos::Profiling::popRegion(); } #endif @@ -325,12 +325,12 @@ TEST_F(TestCategory, update_mv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, update_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_int"); - test_update(); + test_update(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_int"); - test_update_mv(); + test_update_mv(); Kokkos::Profiling::popRegion(); } #endif @@ -339,12 +339,12 @@ TEST_F(TestCategory, update_mv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, update_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_double_int"); - test_update(); + test_update(); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, update_mv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::update_mv_double_int"); - test_update_mv(); + test_update_mv(); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index dc83ac82f5..b3f3566f83 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -21,8 +21,10 @@ #include namespace Test { -template -void impl_test_gemv(const char* mode, int M, int N) { +template +void impl_test_gemv_streams(ExecutionSpace& space, const char* mode, int M, + int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeX::value_type ScalarX; typedef typename ViewTypeY::value_type ScalarY; @@ -47,8 +49,7 @@ void impl_test_gemv(const char* mode, int M, int N) { view_stride_adapter y("Y", ldy); view_stride_adapter org_y("Org_Y", ldy); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); + Kokkos::Random_XorShift64_Pool rand_pool(13718); constexpr double max_valX = 1; constexpr double max_valY = 1; @@ -56,17 +57,17 @@ void impl_test_gemv(const char* mode, int M, int N) { { ScalarX randStart, randEnd; Test::getRandomBounds(max_valX, 
randStart, randEnd); - Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(space, x.d_view, rand_pool, randStart, randEnd); } { ScalarY randStart, randEnd; Test::getRandomBounds(max_valY, randStart, randEnd); - Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(space, y.d_view, rand_pool, randStart, randEnd); } { ScalarA randStart, randEnd; Test::getRandomBounds(max_valA, randStart, randEnd); - Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(space, A.d_view, rand_pool, randStart, randEnd); } const typename KAT_Y::mag_type max_error = @@ -82,7 +83,7 @@ void impl_test_gemv(const char* mode, int M, int N) { Kokkos::deep_copy(expected, org_y.h_view); vanillaGEMV(mode[0], alpha, A.h_view, x.h_view, beta, expected); - KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view, beta, y.d_view); + KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view, beta, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); int numErrors = 0; for (int i = 0; i < ldy; i++) { @@ -97,10 +98,12 @@ void impl_test_gemv(const char* mode, int M, int N) { << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; - Kokkos::deep_copy(y.d_base, org_y.h_base); - KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view_const, beta, y.d_view); + Kokkos::deep_copy(space, y.d_base, org_y.h_base); + KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view_const, beta, + y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; + Kokkos::fence(); // Wait for vanillaGEMV for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } @@ -108,8 +111,9 @@ void impl_test_gemv(const char* mode, int M, int N) { << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; - Kokkos::deep_copy(y.d_base, org_y.h_base); - KokkosBlas::gemv(mode, alpha, A.d_view_const, x.d_view_const, beta, y.d_view); + Kokkos::deep_copy(space, y.d_base, 
org_y.h_base); + KokkosBlas::gemv(space, mode, alpha, A.d_view_const, x.d_view_const, beta, + y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { @@ -123,9 +127,11 @@ void impl_test_gemv(const char* mode, int M, int N) { beta = KAT_Y::zero(); // beta changed, so update the correct answer vanillaGEMV(mode[0], alpha, A.h_view, x.h_view, beta, expected); - Kokkos::deep_copy(y.d_view, KAT_Y::nan()); - KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view, beta, y.d_view); + Kokkos::deep_copy(space, y.d_view, KAT_Y::nan()); + KokkosBlas::gemv(space, mode, alpha, A.d_view, x.d_view, beta, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); + + Kokkos::fence(); // Wait for vanillaGEMV numErrors = 0; for (int i = 0; i < ldy; i++) { if (KAT_Y::isNan(y.h_view(i)) || @@ -141,6 +147,13 @@ void impl_test_gemv(const char* mode, int M, int N) { EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' << N << ", mode " << mode << ": gemv incorrect"; } +template +void impl_test_gemv(const char* mode, int M, int N) { + using execution_space = typename Device::execution_space; + execution_space space; + impl_test_gemv_streams(space, mode, M, N); +} } // namespace Test template @@ -240,11 +253,11 @@ int test_gemv(const char* mode) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_float"); - test_gemv("N"); + test_gemv("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_float"); - test_gemv("T"); + test_gemv("T"); Kokkos::Profiling::popRegion(); } #endif @@ -254,11 +267,11 @@ TEST_F(TestCategory, gemv_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double"); - test_gemv("N"); + test_gemv("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_double"); - 
test_gemv("T"); + test_gemv("T"); Kokkos::Profiling::popRegion(); } #endif @@ -269,17 +282,17 @@ TEST_F(TestCategory, gemv_double) { TEST_F(TestCategory, gemv_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_complex_double"); test_gemv, Kokkos::complex, - Kokkos::complex, TestExecSpace>("N"); + Kokkos::complex, TestDevice>("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_complex_double"); test_gemv, Kokkos::complex, - Kokkos::complex, TestExecSpace>("T"); + Kokkos::complex, TestDevice>("T"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_conj_complex_double"); test_gemv, Kokkos::complex, - Kokkos::complex, TestExecSpace>("C"); + Kokkos::complex, TestDevice>("C"); Kokkos::Profiling::popRegion(); } #endif @@ -289,11 +302,11 @@ TEST_F(TestCategory, gemv_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gemv_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_int"); - test_gemv("N"); + test_gemv("N"); Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_tran_int"); - test_gemv("T"); + test_gemv("T"); Kokkos::Profiling::popRegion(); } #endif @@ -302,11 +315,56 @@ TEST_F(TestCategory, gemv_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, gemv_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_double_int"); - test_gemv("N"); + test_gemv("N"); Kokkos::Profiling::popRegion(); // Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemvt_double_int"); - // test_gemv ("T"); + // test_gemv ("T"); // Kokkos::Profiling::popRegion(); } #endif + +template +int test_gemv_streams(const char* mode) { + using execution_space = typename Device::execution_space; + execution_space space; +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + using view_type_a_ll = Kokkos::View; + using view_type_b_ll = Kokkos::View; + using view_type_c_ll = Kokkos::View; + 
Test::impl_test_gemv_streams(space, mode, 0, 1024); + Test::impl_test_gemv_streams(space, mode, 13, 1024); + Test::impl_test_gemv_streams(space, mode, 50, 40); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + using view_type_a_lr = Kokkos::View; + using view_type_b_lr = Kokkos::View; + using view_type_c_lr = Kokkos::View; + Test::impl_test_gemv_streams(space, mode, 0, 1024); + Test::impl_test_gemv_streams(space, mode, 13, 1024); + Test::impl_test_gemv_streams(space, mode, 50, 40); +#endif + (void)space; + return 1; +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + blas##_##gemv_streams##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gemv_streams("N"); \ + test_gemv_streams("T"); \ + } + +#define NO_TEST_COMPLEX + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST +#undef NO_TEST_COMPLEX \ No newline at end of file diff --git a/blas/unit_test/Test_Blas2_gemv_util.hpp b/blas/unit_test/Test_Blas2_gemv_util.hpp index 99b4516cff..e28310c8eb 100644 --- a/blas/unit_test/Test_Blas2_gemv_util.hpp +++ b/blas/unit_test/Test_Blas2_gemv_util.hpp @@ -23,8 +23,9 @@ namespace Test { -template ::value> +template ::value> using simd_vector = KokkosBatched::Vector, length>; @@ -239,7 +240,8 @@ struct GEMVTest { template static void run_views(const char trans, ViewTypeA A, ViewTypeX x, ViewTypeY y) { - Kokkos::TeamPolicy teams(1, 1); // just run on device + Kokkos::TeamPolicy teams( + 1, 1); // just run on device fill_inputs(A, x, y); ScalarType alpha = 3; // TODO: test also with zero alpha/beta ? 
ScalarType beta = 5; @@ -279,7 +281,8 @@ struct GEMVTest { ViewTypeY, Device, ScalarType>; op_type gemv_op(trans, alpha, A, x, beta, y); - Kokkos::parallel_for(Kokkos::TeamPolicy(1, 1), gemv_op); + Kokkos::parallel_for( + Kokkos::TeamPolicy(1, 1), gemv_op); const double eps = epsilon(ScalarY{}); EXPECT_NEAR_KK_REL_1DVIEW(y, y_ref, eps); @@ -318,7 +321,7 @@ struct GEMVTest { SCALAR_COEF) \ using PREFIX##_##NAME##_gemv_test = \ ::Test::GEMVTest<::Test::FACTORY, SCALAR_A, SCALAR_X, SCALAR_Y, \ - TestExecSpace, SCALAR_COEF>; \ + TestDevice, SCALAR_COEF>; \ TEST_F(TestCategory, PREFIX##_gemv_nt_##NAME) { \ PREFIX##_##NAME##_gemv_test::run("N"); \ } \ diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 7e9ed08d88..a0860bae04 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -14,6 +14,37 @@ // //@HEADER +// ********************************************************************** +// The tests executed by the code below cover many combinations for +// the operation A += alpha * x * y^{T,H}. +// 01) Type of 'x' components: float, double, complex, ... +// 02) Type of 'y' components: float, double, complex, ... +// 03) Type of 'A' components: float, double, complex, ... +// 04) Execution space: serial, threads, OpenMP, Cuda, ... +// 05) Layout of 'x' +// 06) Layout of 'y' +// 07) Layout of 'A' +// 08) Dimension of 'A' +// 09) Options 'const' or 'non const' for x view, when calling ger() +// 10) Options 'const' or 'non const' for y view, when calling ger() +// 11) Usage of analytical results in the tests +// 12) Options 'T' or 'H' when calling ger() +// +// Choices (01)-(04) are selected in the routines TEST_F() at the +// very bottom of the file, when calling test_ger<...>(). +// +// Choices (05)-(12) are selected in routine test_gerr<...>(), +// when calling the method test() of class Test::GerTester<...>. 
+// +// The class Test::GerTester<...> represents the "core" of the test +// logic, where all calculations, comparisons, and success/failure +// decisions are performed. +// +// A high level explanation of method Test::GerTester<...>::test() +// is given by the 9 steps named "Step 1 of 9" to "Step 9 of 9" +// in the code below. +// ********************************************************************** + #include #include #include @@ -35,18 +66,18 @@ class GerTester { const bool useHermitianOption = false); private: - typedef Kokkos::View _ViewTypeX; - typedef Kokkos::View _ViewTypeY; - typedef Kokkos::View _ViewTypeA; + using _ViewTypeX = Kokkos::View; + using _ViewTypeY = Kokkos::View; + using _ViewTypeA = Kokkos::View; - typedef typename _ViewTypeX::HostMirror _HostViewTypeX; - typedef typename _ViewTypeY::HostMirror _HostViewTypeY; - typedef typename _ViewTypeA::HostMirror _HostViewTypeA; - typedef Kokkos::View - _ViewTypeExpected; + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = + Kokkos::View; - typedef Kokkos::ArithTraits _KAT_A; - typedef typename _KAT_A::mag_type _AuxType; + using _KAT_A = Kokkos::ArithTraits; + using _AuxType = typename _KAT_A::mag_type; void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, _HostViewTypeA& h_A, @@ -88,29 +119,31 @@ class GerTester { typename std::enable_if>::value || std::is_same>::value, void>::type - compareVanillaExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected); + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); template typename std::enable_if>::value && !std::is_same>::value, void>::type - compareVanillaExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected); + 
compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); template typename std::enable_if>::value || std::is_same>::value, void>::type - compareKokkosExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected); + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected); template typename std::enable_if>::value && !std::is_same>::value, void>::type - compareKokkosExpected(const T& alpha, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected); + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected); template T shrinkAngleToZeroTwoPiRange(const T input); @@ -127,8 +160,8 @@ class GerTester { const bool _A_is_ll; const bool _testIsGpu; const bool _vanillaUsesDifferentOrderOfOps; - const _AuxType _epsAbs; - const _AuxType _epsRel; + const _AuxType _absTol; + const _AuxType _relTol; int _M; int _N; bool _useAnalyticalResults; @@ -154,8 +187,16 @@ GerTester::value ? 1.0e-6 : 1.0e-9), - _epsRel(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + // **************************************************************** + // Tolerances for double can be tighter than tolerances for float. + // + // In the case of calculations with float, a small amount of + // discrepancies between reference results and CUDA results are + // large enough to require 'relTol' to value 5.0e-3. The same + // calculations show no discrepancies for calculations with double. + // **************************************************************** + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _relTol(std::is_same<_AuxType, float>::value ? 
5.0e-3 : 1.0e-6), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -177,6 +218,7 @@ void GerTesterpopulateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); @@ -262,7 +312,8 @@ void GerTestercompareVanillaExpected(alpha, h_vanilla.d_view, h_expected.d_view); + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, + h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected @@ -323,10 +374,12 @@ void GerTester typename std::enable_if>::value || std::is_same>::value, void>::type -GerTester::compareVanillaExpected(const T& alpha, - const _ViewTypeExpected& h_vanilla, - const _ViewTypeExpected& h_expected) { +GerTester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); @@ -687,7 +740,7 @@ GerTester diffThreshold) { errorHappened = true; numErrorsRealAbs++; @@ -700,13 +753,14 @@ GerTester diffThreshold) { errorHappened = true; numErrorsRealRel++; } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j).real() = " << h_expected(i, j).real() << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() @@ -714,12 +768,13 @@ GerTester 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); @@ -815,22 +876,26 @@ GerTester 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } @@ -1176,10 +1255,9 @@ template typename std::enable_if>::value && !std::is_same>::value, void>::type -GerTester::compareKokkosExpected(const T& alpha, - const _HostViewTypeA& h_A, - const _ViewTypeExpected& 
h_expected) { +GerTester:: + compareKkGerAgainstExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected) { int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * 1.e-3); @@ -1196,7 +1274,7 @@ GerTester diffThreshold) { errorHappened = true; numErrorsAbs++; @@ -1209,21 +1287,24 @@ GerTester diffThreshold) { errorHappened = true; numErrorsRel++; } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "ERROR, i = " << i << ", j = " << j << ": h_expected(i,j) = " << h_expected(i, j) << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif } } // for j } // for i +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll << ", alpha type = " << typeid(alpha).name() @@ -1241,6 +1322,7 @@ GerTester 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; +#endif { std::ostringstream msg; msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr @@ -1263,7 +1345,9 @@ GerTester 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "WARNING" << msg.str() << std::endl; +#endif } EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); } @@ -1278,22 +1362,35 @@ void GerTestercompareKokkosExpected(alpha, h_A, h_expected); + this->compareKkGerAgainstExpected(alpha, h_A, h_expected); } } } // namespace Test template +#ifdef HAVE_KOKKOSKERNELS_DEBUG int test_ger(const std::string& caseName) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); +#else + Kokkos::printf( + "+=======================================================================" + "===\n"); +#endif +#if KOKKOS_VERSION < 40199 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s, device = %s ...\n", caseName.c_str(), typeid(Device).name()); - +#else + Kokkos::printf("Starting %s, device = %s ...\n", caseName.c_str(), + typeid(Device).name()); +#endif +#else +int test_ger(const std::string& /*caseName*/) { +#endif bool xBool = std::is_same::value || std::is_same::value || std::is_same>::value || @@ -1340,12 +1451,23 @@ int test_ger(const std::string& caseName) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); - +#else + Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); +#endif +#endif if (true) { Test::GerTester @@ -1374,22 +1496,45 @@ int test_ger(const std::string& caseName) { } } +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", caseName.c_str()); +#else + Kokkos::printf("Finished %s for LAYOUTLEFT\n", caseName.c_str()); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); +#else 
+ Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); - +#else + Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); +#endif +#endif if (true) { Test::GerTester @@ -1418,21 +1563,44 @@ int test_ger(const std::string& caseName) { } } +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); +#else + Kokkos::printf("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif #endif #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); - +#else + Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); +#endif +#endif if (true) { Test::GerTester @@ -1458,21 +1626,44 @@ int test_ger(const std::string& caseName) { } } +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); +#else + Kokkos::printf("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" 
"---\n"); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif #endif #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); - +#else + Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); +#endif +#endif if (true) { Test::GerTester @@ -1493,18 +1684,41 @@ int test_ger(const std::string& caseName) { tester.test(1024, 1024, 0); } +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); +#else + Kokkos::printf("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" "---\n"); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif #endif +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); +#else + Kokkos::printf("Finished %s\n", caseName.c_str()); +#endif +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); - +#else + Kokkos::printf( + "+=======================================================================" + "===\n"); +#endif +#endif return 1; } @@ -1513,7 +1727,7 @@ int test_ger(const std::string& caseName) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
TEST_F(TestCategory, ger_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_float"); - test_ger("test case ger_float"); + test_ger("test case ger_float"); Kokkos::Profiling::popRegion(); } #endif @@ -1524,8 +1738,7 @@ TEST_F(TestCategory, ger_float) { TEST_F(TestCategory, ger_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_float"); test_ger, Kokkos::complex, - Kokkos::complex, TestExecSpace>( - "test case ger_complex_float"); + Kokkos::complex, TestDevice>("test case ger_complex_float"); Kokkos::Profiling::popRegion(); } #endif @@ -1535,7 +1748,7 @@ TEST_F(TestCategory, ger_complex_float) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double"); - test_ger("test case ger_double"); + test_ger("test case ger_double"); Kokkos::Profiling::popRegion(); } #endif @@ -1546,8 +1759,7 @@ TEST_F(TestCategory, ger_double) { TEST_F(TestCategory, ger_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_double"); test_ger, Kokkos::complex, - Kokkos::complex, TestExecSpace>( - "test case ger_complex_double"); + Kokkos::complex, TestDevice>("test case ger_complex_double"); Kokkos::Profiling::popRegion(); } #endif @@ -1557,7 +1769,7 @@ TEST_F(TestCategory, ger_complex_double) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_int"); - test_ger("test case ger_int"); + test_ger("test case ger_int"); Kokkos::Profiling::popRegion(); } #endif @@ -1566,7 +1778,7 @@ TEST_F(TestCategory, ger_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, ger_double_int_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int_float"); - test_ger("test case ger_mixed_types"); + test_ger("test case ger_double_int_float"); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas2_syr.hpp 
b/blas/unit_test/Test_Blas2_syr.hpp new file mode 100644 index 0000000000..4396c81bb2 --- /dev/null +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -0,0 +1,1924 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// ********************************************************************** +// The tests executed by the code below cover many combinations for +// the operation A += alpha * x * x^{T,H}: +// 01) Type of 'x' components: float, double, complex, ... +// 02) Type of 'A' components: float, double, complex, ... +// 03) Execution space: serial, threads, OpenMP, Cuda, ... +// 04) Layout of 'x' +// 05) Layout of 'A' +// 06) Dimension of 'A' +// 07) Options 'const' or 'non const' for x view, when calling syr() +// 08) Usage of analytical results in the tests +// 09) Options 'T' or 'H' when calling syr() +// 10) Options 'U' or 'L' when calling syr() +// +// Choices (01)-(03) are selected in the routines TEST_F() at the +// very bottom of the file, when calling test_syr<...>(). +// +// Choices (04)-(10) are selected in routine test_syr<...>(), +// when calling the method test() of class Test::SyrTester<...>. +// +// The class Test::SyrTester<...> represents the "core" of the test +// logic, where all calculations, comparisons, and success/failure +// decisions are performed. +// +// A high level explanation of method Test::SyrTester<...>::test() +// is given by the 7 steps named "Step 1 of 7" to "Step 7 of 7" +// in the code below. 
+// ********************************************************************** + +#include +#include +#include +#include +#include + +namespace Test { + +template +class SyrTester { + public: + SyrTester(); + + ~SyrTester(); + + void test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults = false, + const bool useHermitianOption = false, + const bool useUpOption = false); + + private: + using _ViewTypeX = Kokkos::View; + using _ViewTypeA = Kokkos::View; + + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = + Kokkos::View; + + using _KAT_A = Kokkos::ArithTraits; + using _AuxType = typename _KAT_A::mag_type; + + void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected, + _ViewTypeX& x, _ViewTypeA& A, + bool& expectedResultIsKnown); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename 
std::enable_if>::value && + !std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); + + template + T shrinkAngleToZeroTwoPiRange(const T input); + + template + void callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, + _ViewTypeA& A, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation); + + template + void callKkGerAndCompareKkSyrAgainstIt( + const ScalarA& alpha, TX& x, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _HostViewTypeA& h_A_syr, const std::string& situation); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; + const _AuxType _absTol; + const _AuxType _relTol; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _useUpOption; + bool _kkSyrShouldThrowException; + bool _kkGerShouldThrowException; +}; + +template +SyrTester::SyrTester() + : _A_is_complex(std::is_same>::value || + std::is_same>::value), + _A_is_lr(std::is_same::value), + _A_is_ll(std::is_same::value), + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< + typename Device::execution_space>()) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + , + _vanillaUsesDifferentOrderOfOps(_A_is_lr) +#else + , + _vanillaUsesDifferentOrderOfOps(false) +#endif + , + // **************************************************************** + // Tolerances for double can be tighter than tolerances for float. 
+ // + // In the case of calculations with float, a small amount of + // discrepancies between reference results and CUDA results are + // large enough to require 'relTol' to value 5.0e-3. The same + // calculations show no discrepancies for calculations with double. + // **************************************************************** + _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _M(-1), + _N(-1), + _useAnalyticalResults(false), + _useHermitianOption(false), + _useUpOption(false), + _kkSyrShouldThrowException(false), + _kkGerShouldThrowException(false) { +} + +template +SyrTester::~SyrTester() { + // Nothing to do +} + +template +void SyrTester::test( + const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults, const bool useHermitianOption, + const bool useUpOption) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Entering SyrTester::test()... - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - " + << std::endl; + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " + << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol + << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults + << ", useHermitianOption = " << useHermitianOption + << ", useUpOption = " << useUpOption << std::endl; +#endif + // ******************************************************************** + // Step 1 of 7: declare main types and variables + // ******************************************************************** + _M = N; + _N = N; + _useAnalyticalResults = useAnalyticalResults; + _useHermitianOption = useHermitianOption; + _useUpOption = useUpOption; + +#ifdef 
KOKKOSKERNELS_ENABLE_TPL_BLAS + _kkSyrShouldThrowException = false; + + _kkGerShouldThrowException = false; + if (_A_is_complex && _useHermitianOption) { + _kkGerShouldThrowException = !_A_is_ll; + } +#endif + + bool test_x(false); + bool test_cx(false); + if (nonConstConstCombinations == 0) { + test_x = true; + } else if (nonConstConstCombinations == 1) { + test_cx = true; + } else { + test_x = true; + test_cx = true; + } + + view_stride_adapter<_ViewTypeX, false> x("X", _M); + view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); + + view_stride_adapter<_ViewTypeExpected, true> h_expected( + "expected A += alpha * x * x^{t,h}", _M, _N); + bool expectedResultIsKnown = false; + + ScalarA alpha(_KAT_A::zero()); + + // ******************************************************************** + // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A + // ******************************************************************** + this->populateVariables(alpha, x.h_view, A.h_view, h_expected.d_view, + x.d_view, A.d_view, expectedResultIsKnown); + + // ******************************************************************** + // Step 3 of 7: populate h_vanilla + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_vanilla( + "vanilla = A + alpha * x * x^{t,h}", _M, _N); +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", + typeid(alpha).name()); +#else + Kokkos::printf( + "In Test_Blas2_syr.hpp, computing vanilla A with alpha type = %s\n", + typeid(alpha).name()); +#endif +#endif + this->populateVanillaValues(alpha, x.h_view, A.h_view, h_vanilla.d_view); + + // ******************************************************************** + // Step 4 of 7: use h_vanilla and h_expected as appropriate + // ******************************************************************** + if (expectedResultIsKnown) { + // 
****************************************************************** + // Compare h_vanilla against h_expected + // ****************************************************************** + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, + h_expected.d_view); + } else { + // ****************************************************************** + // Copy h_vanilla to h_expected + // ****************************************************************** + Kokkos::deep_copy(h_expected.d_base, h_vanilla.d_base); + } + + // ******************************************************************** + // Step 5 of 7: test with 'non const x' + // ******************************************************************** + view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); + Kokkos::deep_copy(org_A.d_base, A.d_base); + Kokkos::deep_copy(org_A.h_view, A.h_view); + + if (test_x) { + this->callKkSyrAndCompareAgainstExpected( + alpha, x.d_view, A.d_view, A.h_view, h_expected.d_view, "non const x"); + + if ((_useAnalyticalResults == false) && // Just to save run time + (_kkGerShouldThrowException == false)) { + this->callKkGerAndCompareKkSyrAgainstIt(alpha, x.d_view, org_A, A.h_view, + "non const x"); + } + } + + // ******************************************************************** + // Step 6 of 7: test with const x + // ******************************************************************** + if (test_cx) { + Kokkos::deep_copy(A.d_base, org_A.d_base); + + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A.d_view, + A.h_view, h_expected.d_view, + "const x"); + } + + // ******************************************************************** + // Step 7 of 7: tests with invalid values on the first input parameter + // ******************************************************************** + EXPECT_ANY_THROW(KokkosBlas::syr(".", "U", alpha, x.d_view, A.d_view)) + << "Failed test: kk syr should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW(KokkosBlas::syr("", 
"U", alpha, x.d_view, A.d_view)) + << "Failed test: kk syr should have thrown an exception for mode ''"; + EXPECT_ANY_THROW(KokkosBlas::syr("T", ".", alpha, x.d_view, A.d_view)) + << "Failed test: kk syr should have thrown an exception for uplo '.'"; + EXPECT_ANY_THROW(KokkosBlas::syr("T", "", alpha, x.d_view, A.d_view)) + << "Failed test: kk syr should have thrown an exception for uplo ''"; + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Leaving SyrTester::test() - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - " + << std::endl; +#endif +} + +template +void SyrTester::populateVariables( + ScalarA& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, _ViewTypeX& x, _ViewTypeA& A, + bool& expectedResultIsKnown) { + expectedResultIsKnown = false; + + if (_useAnalyticalResults) { + this->populateAnalyticalValues(alpha, h_x, h_A, h_expected); + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(A, h_A); + + expectedResultIsKnown = true; + } else if (_N == 1) { + alpha = 3; + + h_x[0] = 2; + + h_A(0, 0) = 7; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(A, h_A); + + h_expected(0, 0) = 19; + expectedResultIsKnown = true; + } else if (_N == 2) { + alpha = 3; + + h_x[0] = -2; + h_x[1] = 9; + + h_A(0, 0) = 17; + h_A(0, 1) = -43; + h_A(1, 0) = -43; + h_A(1, 1) = 101; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(A, h_A); + + if (_useUpOption) { + h_expected(0, 0) = 29; + h_expected(0, 1) = -97; + h_expected(1, 0) = -43; + h_expected(1, 1) = 344; + } else { + h_expected(0, 0) = 29; + h_expected(0, 1) = -43; + h_expected(1, 0) = -97; + h_expected(1, 1) = 344; + } + expectedResultIsKnown = true; + } else { + alpha = 3; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(x, rand_pool, randStart, randEnd); + } + + { + ScalarA randStart, randEnd; + 
Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(h_x, x); + Kokkos::deep_copy(h_A, A); + + if (_useHermitianOption && _A_is_complex) { + // **************************************************************** + // Make h_A Hermitian + // **************************************************************** + for (int i(0); i < _N; ++i) { + for (int j(i + 1); j < _N; ++j) { + h_A(i, j) = _KAT_A::conj(h_A(j, i)); + } + } + + for (int i(0); i < _N; ++i) { + h_A(i, i) = 0.5 * (h_A(i, i) + _KAT_A::conj(h_A(i, i))); + } + } else { + // **************************************************************** + // Make h_A symmetric + // **************************************************************** + for (int i(0); i < _N; ++i) { + for (int j(i + 1); j < _N; ++j) { + h_A(i, j) = h_A(j, i); + } + } + } + Kokkos::deep_copy(A, h_A); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_origA(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; + } + } + } +#endif +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + if (_useHermitianOption) { + alpha.real() = 1.; + alpha.imag() = 0.; + } else { + alpha.real() = 1.; + alpha.imag() = -1.; + } + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i].real() = sin(auxI); + h_x[i].imag() = cos(auxI); + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_A(i, j).real() = cos(auxImJ); + h_A(i, 
j).imag() = -sin(auxImJ); + } else { + h_A(i, j).real() = cos(auxImJ); + h_A(i, j).imag() = sin(auxImJ); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = sin(auxIpJ) + cos(auxIpJ); + h_A(i, j).imag() = sin(auxIpJ) - cos(auxIpJ); + } + } + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j).real() = 2. * cos(auxImJ); + h_expected(i, j).imag() = -2. * sin(auxImJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j).real() = 2. * sin(auxIpJ); + h_expected(i, j).imag() = 2. 
* sin(auxIpJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +SyrTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + alpha = 2; + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i] = sin(auxI); + } + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + for (int j = 0; j < _N; ++j) { + _AuxType auxJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + h_A(i, j) = 2 * cos(auxI) * cos(auxJ); + } + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j) = 2 * cos(auxImJ); + } else { + h_expected(i, j) = h_A(i, j); + } + } + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +SyrTester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_vanillaUsesDifferentOrderOfOps) { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * _KAT_A::conj(h_x(j)) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + for (int i = 0; i < _N; ++i) { + h_vanilla(i, i).imag() = 0.; + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && 
(i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } else { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_x(j)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + for (int i = 0; i < _N; ++i) { + h_vanilla(i, i).imag() = 0.; + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +SyrTester::populateVanillaValues( + const T& alpha, const _HostViewTypeX& h_x, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } +} + +template +template +T SyrTester::shrinkAngleToZeroTwoPiRange(const T input) { + T output(input); +#if 0 + T twoPi( 2. * Kokkos::numbers::pi ); + if (input > 0.) { + output -= std::floor( input / twoPi ) * twoPi; + } + else if (input < 0.) 
{ + output += std::floor( -input / twoPi ) * twoPi; + } +#endif + return output; +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +SyrTester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + errorHappened = false; + if (h_expected(i, j).real() == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - " + "h_vanilla(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + errorHappened = false; + if (h_expected(i, j).imag() == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - " + "h_vanilla(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + } // for j + } // for i + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_vanilla(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); + if (numErrorsReal > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "WARNING" << msg.str() << std::endl; +#endif + } + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_vanilla(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); + if (numErrorsImag > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "WARNING" << msg.str() << std::endl; +#endif + } + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + } else { + int numErrorsReal(0); + int numErrorsImag(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j).real() != h_vanilla(i, j).real()) { + if (numErrorsReal == 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " + << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << std::endl; +#endif + } + numErrorsReal++; + } + + if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { + if (numErrorsImag == 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " + << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << std::endl; +#endif + } + numErrorsImag++; + } + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", 
numErrorsImag = " << numErrorsImag; + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +SyrTester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "h_exp(" << i << "," << j << ")=" << h_expected(i, j) + << ", h_van(" << i << "," << j << ")=" << h_vanilla(i, j) + << std::endl; +#endif + } + } + } + + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j) - h_vanilla(i, j)); + errorHappened = false; + if (h_expected(i, j) == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + } // for j + } // for i + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_vanilla(i,j) = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); + if (numErrors > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "WARNING" << msg.str() << std::endl; +#endif + } + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } else { + int numErrors(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j) != h_vanilla(i, j)) { + if (numErrors == 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; +#endif + } + numErrors++; + } + } // for j + } // for i + EXPECT_EQ(numErrors, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +SyrTester:: + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) + << ", h_A(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; +#endif + } + } + } + + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + 
int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_reference(i, j).real() - h_A(i, j).real()); + errorHappened = false; + if (h_reference(i, j).real() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + + diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); + errorHappened = false; + if (h_reference(i, j).imag() == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + } // for j + } // for i + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + if ((_M == 2131) && (_N == 2131)) { + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ", " << h_reference(11, 2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " + << h_A(11, 2119).imag() << ")" << std::endl; + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ", " << h_reference(710, 1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " + << h_A(710, 1065).imag() << ")" << std::endl; + } +#endif + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": 
syr result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); + if (numErrorsReal > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "WARNING" << msg.str() << std::endl; +#endif + } + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); + if (numErrorsImag > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "WARNING" << msg.str() << std::endl; +#endif + } + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +SyrTester:: + compareKkSyrAgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ")=" << h_reference(i, j) + << ", h_A(" << i << "," << j << ")=" << h_A(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_reference(i, j) - h_A(i, j)); + errorHappened = false; + if (h_reference(i, j) == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) + << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + } // for j + } // for i +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; +#endif + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr result is incorrect" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); + if (numErrors > 0) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "WARNING" << msg.str() << std::endl; +#endif + } + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +template +template +void SyrTester:: + callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, + _ViewTypeA& A, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha + << std::endl; +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " + "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkSyrShouldThrowException); +#else + Kokkos::printf( + "In Test_Blas2_syr.hpp, right before calling KokkosBlas::syr(): " + "ViewTypeA = %s, _kkSyrShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkSyrShouldThrowException); +#endif +#endif + 
std::string mode = _useHermitianOption ? "H" : "T"; + std::string uplo = _useUpOption ? "U" : "L"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr, '" << situation + << "': caught exception, e.what() = " << e.what() << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr, '" << situation + << "': caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened"; + + EXPECT_EQ(gotStdException, _kkSyrShouldThrowException) + << "Failed test, '" << situation << "': kk syr() should" + << (_kkSyrShouldThrowException ? " " : " not ") + << "have thrown a std::exception"; + + if ((gotStdException == false) && (gotUnknownException == false)) { + Kokkos::deep_copy(h_A, A); + this->compareKkSyrAgainstReference(alpha, h_A, h_expected); + } +} + +template +template +void SyrTester:: + callKkGerAndCompareKkSyrAgainstIt( + const ScalarA& alpha, TX& x, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _HostViewTypeA& h_A_syr, const std::string& situation) { + view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); + Kokkos::deep_copy(A_ger.d_base, org_A.d_base); + + // ******************************************************************** + // Call ger() + // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr, '" << situation << "', alpha = " << alpha + << std::endl; +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " + "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), 
_kkGerShouldThrowException); +#else + Kokkos::printf( + "In Test_Blas2_syr.hpp, right before calling KokkosBlas::ger(): " + "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkGerShouldThrowException); +#endif +#endif + std::string mode = _useHermitianOption ? "H" : "T"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::ger(mode.c_str(), alpha, x, x, A_ger.d_view); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr, '" << situation + << "', ger() call: caught exception, e.what() = " << e.what() + << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr, '" << situation + << "', ger() call: caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call"; + + EXPECT_EQ(gotStdException, false) + << "Failed test, '" << situation + << "': kk ger() should not have thrown a std::exception"; + + // ******************************************************************** + // Prepare h_ger_reference to be compared against h_A_syr + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference( + "h_ger_reference", _M, _N); + Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); + + std::string uplo = _useUpOption ? 
"U" : "L"; + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + // Keep h_ger_reference as already computed + } else { + h_ger_reference.d_view(i, j) = org_A.h_view(i, j); + } + } + } + if (_useHermitianOption && _A_is_complex) { + for (int i(0); i < _N; ++i) { + h_ger_reference.d_view(i, i) = + 0.5 * (h_ger_reference.d_view(i, i) + + _KAT_A::conj(h_ger_reference.d_view(i, i))); + } + } + + // ******************************************************************** + // Compare + // ******************************************************************** + this->compareKkSyrAgainstReference(alpha, h_A_syr, h_ger_reference.d_view); +} + +} // namespace Test + +template +#ifdef HAVE_KOKKOSKERNELS_DEBUG +int test_syr(const std::string& caseName) { +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); +#else + Kokkos::printf( + "+=======================================================================" + "===\n"); + Kokkos::printf("Starting %s ...\n", caseName.c_str()); +#endif +#else +int test_syr(const std::string& /*caseName*/) { +#endif + bool xBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool aBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool useAnalyticalResults = xBool && aBool; + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", + caseName.c_str()); 
+#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); + Kokkos::printf("Starting %s for LAYOUTLEFT ...\n", caseName.c_str()); +#endif +#endif + if (true) { + Test::SyrTester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#else + Kokkos::printf("Finished %s for LAYOUTLEFT\n", caseName.c_str()); + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTRIGHT ...\n", + caseName.c_str()); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); + Kokkos::printf("Starting %s for LAYOUTRIGHT ...\n", caseName.c_str()); +#endif +#endif + if (true) { + Test::SyrTester + tester; + 
tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#else + Kokkos::printf("Finished %s for LAYOUTRIGHT\n", caseName.c_str()); + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", + caseName.c_str()); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); + Kokkos::printf("Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str()); +#endif +#endif + if (true) { + Test::SyrTester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + 
tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#else + Kokkos::printf("Finished %s for LAYOUTSTRIDE\n", caseName.c_str()); + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", + caseName.c_str()); +#else + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); + Kokkos::printf("Starting %s for MIXED LAYOUTS ...\n", caseName.c_str()); +#endif +#endif + if (true) { + Test::SyrTester + tester; + tester.test(1, 0); + tester.test(2, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + } + + if (true) { + Test::SyrTester + tester; + tester.test(1024, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); +#else + Kokkos::printf("Finished %s for MIXED LAYOUTS\n", caseName.c_str()); + Kokkos::printf( + "+-----------------------------------------------------------------------" + "---\n"); +#endif +#endif +#endif + +#ifdef HAVE_KOKKOSKERNELS_DEBUG +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); +#else + Kokkos::printf("Finished %s\n", caseName.c_str()); + Kokkos::printf( + "+=======================================================================" + "===\n"); +#endif +#endif + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_float"); + test_syr("test case syr_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_complex_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_float"); + test_syr, Kokkos::complex, TestDevice>( + "test case syr_complex_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_double"); + test_syr("test case syr_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_complex_double"); + test_syr, Kokkos::complex, TestDevice>( + "test case syr_complex_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int"); + test_syr("test case syr_int"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, syr_int_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr_int_float"); + test_syr("test case syr_int_float"); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index 13c52ec437..cd91bc6d95 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -81,7 +81,7 @@ void build_matrices(const int M, const int N, const int K, const typename ViewTypeA::value_type alpha, ViewTypeA& A, ViewTypeB& B, const typename ViewTypeA::value_type beta, ViewTypeC& C, ViewTypeC& Cref) { - using execution_space = TestExecSpace; + using execution_space = typename TestDevice::execution_space; using ScalarA = typename ViewTypeA::non_const_value_type; using ScalarB = typename ViewTypeB::non_const_value_type; using ScalarC = typename ViewTypeC::non_const_value_type; @@ -257,15 +257,16 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, } } -template +template void impl_test_stream_gemm_psge2(const int M, const int N, const int K, const Scalar alpha, const Scalar beta) { - using ViewTypeA = Kokkos::View; - using ViewTypeB = Kokkos::View; - using ViewTypeC = Kokkos::View; - using ScalarC = typename ViewTypeC::value_type; - using APT = Kokkos::ArithTraits; 
- using mag_type = typename APT::mag_type; + using execution_space = typename Device::execution_space; + using ViewTypeA = Kokkos::View; + using ViewTypeB = Kokkos::View; + using ViewTypeC = Kokkos::View; + using ScalarC = typename ViewTypeC::value_type; + using APT = Kokkos::ArithTraits; + using mag_type = typename APT::mag_type; const char tA[] = {"N"}; const char tB[] = {"N"}; @@ -336,9 +337,10 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, template void test_gemm() { - typedef Kokkos::View view_type_a; - typedef Kokkos::View view_type_b; - typedef Kokkos::View view_type_c; + typedef typename TestDevice::execution_space execution_space; + typedef Kokkos::View view_type_a; + typedef Kokkos::View view_type_b; + typedef Kokkos::View view_type_c; std::vector modes = {"N", "T"}; if (std::is_same>::value || std::is_same>::value) @@ -348,37 +350,32 @@ void test_gemm() { for (Scalar beta : betas) { for (auto amode : modes) { for (auto bmode : modes) { - Test::impl_test_gemm(amode, bmode, 0, 0, 0, alpha, beta); + Test::impl_test_gemm( + amode, bmode, 0, 0, 0, alpha, beta); // BMK: N = 1 exercises the special GEMV code path in GEMM (currently, // only for modes N/N) - Test::impl_test_gemm(amode, bmode, 50, 1, 40, alpha, - beta); + Test::impl_test_gemm( + amode, bmode, 50, 1, 40, alpha, beta); // LBV: K = 0 exercise the quick return code path in GEMM - Test::impl_test_gemm(amode, bmode, 20, 14, 0, alpha, - beta); - Test::impl_test_gemm(amode, bmode, 13, 15, 17, alpha, - beta); - Test::impl_test_gemm(amode, bmode, 179, 15, 211, alpha, - beta); - Test::impl_test_gemm(amode, bmode, 12, 3071, 517, alpha, - beta); + Test::impl_test_gemm( + amode, bmode, 20, 14, 0, alpha, beta); + Test::impl_test_gemm( + amode, bmode, 13, 15, 17, alpha, beta); + Test::impl_test_gemm( + amode, bmode, 179, 15, 211, alpha, beta); + Test::impl_test_gemm( + amode, bmode, 12, 3071, 517, alpha, beta); } } } - auto pool_size = TestExecSpace().concurrency(); + auto pool_size = 
execution_space().concurrency(); if (pool_size >= 2) { - Test::impl_test_stream_gemm_psge2( + Test::impl_test_stream_gemm_psge2( 53, 42, 17, 4.5, 3.0); // General code path - Test::impl_test_stream_gemm_psge2( + Test::impl_test_stream_gemm_psge2( 13, 1, 17, 4.5, 3.0); // gemv based gemm code path - Test::impl_test_stream_gemm_psge2( + Test::impl_test_stream_gemm_psge2( 7, 13, 17, 4.5, 3.0); // dot based gemm code path } @@ -400,8 +397,8 @@ void test_gemm_enabled_layouts() { template void test_gemm_mixed_scalars() { - using Matrix1 = Kokkos::View; - using Matrix2 = Kokkos::View; + using Matrix1 = Kokkos::View; + using Matrix2 = Kokkos::View; const int dim1 = 400, dim2 = 1000; @@ -414,8 +411,8 @@ void test_gemm_mixed_scalars() { Kokkos::deep_copy(B, Kokkos::ArithTraits::one()); Kokkos::deep_copy(C, Kokkos::ArithTraits::one()); - KokkosBlas::gemm(TestExecSpace(), "N", "N", 1.0, D, A, 0.0, C); - KokkosBlas::gemm(TestExecSpace(), "N", "T", 1.0, C, D, 0.0, B); + KokkosBlas::gemm(TestDevice(), "N", "N", 1.0, D, A, 0.0, C); + KokkosBlas::gemm(TestDevice(), "N", "T", 1.0, C, D, 0.0, B); } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ diff --git a/blas/unit_test/Test_Blas3_trmm.hpp b/blas/unit_test/Test_Blas3_trmm.hpp index 188999c5e0..a186835aaa 100644 --- a/blas/unit_test/Test_Blas3_trmm.hpp +++ b/blas/unit_test/Test_Blas3_trmm.hpp @@ -259,42 +259,42 @@ int test_trmm(const char* mode, ScalarA alpha) { TEST_F(TestCategory, trmm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_float"); float alpha = 1.0f; - test_trmm("LLNN", alpha); - test_trmm("LLNU", alpha); - test_trmm("LLTN", alpha); - test_trmm("LLTU", alpha); - test_trmm("LUNN", alpha); - test_trmm("LUNU", alpha); - test_trmm("LUTN", alpha); - test_trmm("LUTU", alpha); - - test_trmm("RLNN", alpha); - test_trmm("RLNU", alpha); - test_trmm("RLTN", alpha); - test_trmm("RLTU", alpha); - test_trmm("RUNN", alpha); - test_trmm("RUNU", alpha); - test_trmm("RUTN", alpha); - test_trmm("RUTU", alpha); + 
test_trmm("LLNN", alpha); + test_trmm("LLNU", alpha); + test_trmm("LLTN", alpha); + test_trmm("LLTU", alpha); + test_trmm("LUNN", alpha); + test_trmm("LUNU", alpha); + test_trmm("LUTN", alpha); + test_trmm("LUTU", alpha); + + test_trmm("RLNN", alpha); + test_trmm("RLNU", alpha); + test_trmm("RLTN", alpha); + test_trmm("RLTU", alpha); + test_trmm("RUNN", alpha); + test_trmm("RUNU", alpha); + test_trmm("RUTN", alpha); + test_trmm("RUTU", alpha); alpha = 4.5f; - test_trmm("LLNN", alpha); - test_trmm("LLNU", alpha); - test_trmm("LLTN", alpha); - test_trmm("LLTU", alpha); - test_trmm("LUNN", alpha); - test_trmm("LUNU", alpha); - test_trmm("LUTN", alpha); - test_trmm("LUTU", alpha); - - test_trmm("RLNN", alpha); - test_trmm("RLNU", alpha); - test_trmm("RLTN", alpha); - test_trmm("RLTU", alpha); - test_trmm("RUNN", alpha); - test_trmm("RUNU", alpha); - test_trmm("RUTN", alpha); - test_trmm("RUTU", alpha); + test_trmm("LLNN", alpha); + test_trmm("LLNU", alpha); + test_trmm("LLTN", alpha); + test_trmm("LLTU", alpha); + test_trmm("LUNN", alpha); + test_trmm("LUNU", alpha); + test_trmm("LUTN", alpha); + test_trmm("LUTU", alpha); + + test_trmm("RLNN", alpha); + test_trmm("RLNU", alpha); + test_trmm("RLTN", alpha); + test_trmm("RLTU", alpha); + test_trmm("RUNN", alpha); + test_trmm("RUNU", alpha); + test_trmm("RUTN", alpha); + test_trmm("RUTU", alpha); Kokkos::Profiling::popRegion(); } #endif @@ -305,42 +305,42 @@ TEST_F(TestCategory, trmm_float) { TEST_F(TestCategory, trmm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_double"); double alpha = 1.0; - test_trmm("LLNN", alpha); - test_trmm("LLNU", alpha); - test_trmm("LLTN", alpha); - test_trmm("LLTU", alpha); - test_trmm("LUNN", alpha); - test_trmm("LUNU", alpha); - test_trmm("LUTN", alpha); - test_trmm("LUTU", alpha); - - test_trmm("RLNN", alpha); - test_trmm("RLNU", alpha); - test_trmm("RLTN", alpha); - test_trmm("RLTU", alpha); - test_trmm("RUNN", alpha); - test_trmm("RUNU", alpha); - test_trmm("RUTN", 
alpha); - test_trmm("RUTU", alpha); + test_trmm("LLNN", alpha); + test_trmm("LLNU", alpha); + test_trmm("LLTN", alpha); + test_trmm("LLTU", alpha); + test_trmm("LUNN", alpha); + test_trmm("LUNU", alpha); + test_trmm("LUTN", alpha); + test_trmm("LUTU", alpha); + + test_trmm("RLNN", alpha); + test_trmm("RLNU", alpha); + test_trmm("RLTN", alpha); + test_trmm("RLTU", alpha); + test_trmm("RUNN", alpha); + test_trmm("RUNU", alpha); + test_trmm("RUTN", alpha); + test_trmm("RUTU", alpha); alpha = 4.5; - test_trmm("LLNN", alpha); - test_trmm("LLNU", alpha); - test_trmm("LLTN", alpha); - test_trmm("LLTU", alpha); - test_trmm("LUNN", alpha); - test_trmm("LUNU", alpha); - test_trmm("LUTN", alpha); - test_trmm("LUTU", alpha); - - test_trmm("RLNN", alpha); - test_trmm("RLNU", alpha); - test_trmm("RLTN", alpha); - test_trmm("RLTU", alpha); - test_trmm("RUNN", alpha); - test_trmm("RUNU", alpha); - test_trmm("RUTN", alpha); - test_trmm("RUTU", alpha); + test_trmm("LLNN", alpha); + test_trmm("LLNU", alpha); + test_trmm("LLTN", alpha); + test_trmm("LLTU", alpha); + test_trmm("LUNN", alpha); + test_trmm("LUNU", alpha); + test_trmm("LUTN", alpha); + test_trmm("LUTU", alpha); + + test_trmm("RLNN", alpha); + test_trmm("RLNU", alpha); + test_trmm("RLTN", alpha); + test_trmm("RLTU", alpha); + test_trmm("RUNN", alpha); + test_trmm("RUNU", alpha); + test_trmm("RUTN", alpha); + test_trmm("RUTU", alpha); Kokkos::Profiling::popRegion(); } #endif @@ -351,194 +351,194 @@ TEST_F(TestCategory, trmm_double) { ///////////////// alpha 1.0 ///////////////// TEST_F(TestCategory, trmm_complex_double_LLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + 
test_trmm, Kokkos::complex, TestDevice>( "LLNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, 
trmm_complex_double_RLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNU", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCN", 1.0); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCU", 1.0); Kokkos::Profiling::popRegion(); } ///////////////// alpha 4.5 ///////////////// TEST_F(TestCategory, trmm_complex_double_LLNN_fourfive) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_LUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } 
TEST_F(TestCategory, trmm_complex_double_LUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_LUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNU", 
Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCN", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_double_RUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_double_RUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCU", Kokkos::complex(4.5, 0.0)); Kokkos::Profiling::popRegion(); } @@ -550,194 +550,194 @@ TEST_F(TestCategory, trmm_complex_double_RUCU_fourfive) { ///////////////// alpha 1.0 ///////////////// TEST_F(TestCategory, trmm_complex_float_LLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LLNN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLNN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LLNU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLNU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LLCN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLCN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LLCU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LLCU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNN_one) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LUNN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUNN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LUNU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUNU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LUCN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUCN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "LUCU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("LUCU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RLNN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLNN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RLNU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLNU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RLCN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLCN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCU_one) { 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RLCU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RLCU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RUNN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUNN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RUNU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUNU", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCN_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RUCN", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUCN", + 1.0f); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCU_one) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( - "RUCU", 1.0f); + test_trmm, Kokkos::complex, TestDevice>("RUCU", + 1.0f); Kokkos::Profiling::popRegion(); } ///////////////// alpha 4.5 ///////////////// TEST_F(TestCategory, trmm_complex_float_LLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); 
} TEST_F(TestCategory, trmm_complex_float_LLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LLCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_LUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_LUCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "LUCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNN", 
Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RLCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RLCU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RLCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUNU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUNU"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUNU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCN_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCN"); - test_trmm, Kokkos::complex, TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCN", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, trmm_complex_float_RUCU_fourfive) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trmm_complex_float_RUCU"); - test_trmm, Kokkos::complex, 
TestExecSpace>( + test_trmm, Kokkos::complex, TestDevice>( "RUCU", Kokkos::complex(4.5f, 0.0f)); Kokkos::Profiling::popRegion(); } diff --git a/blas/unit_test/Test_Blas3_trsm.hpp b/blas/unit_test/Test_Blas3_trsm.hpp index 5edd175652..9a00f22263 100644 --- a/blas/unit_test/Test_Blas3_trsm.hpp +++ b/blas/unit_test/Test_Blas3_trsm.hpp @@ -261,42 +261,42 @@ int test_trsm(const char* mode, ScalarA alpha) { TEST_F(TestCategory, trsm_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_float"); float alpha = 1.0f; - test_trsm("LLNN", alpha); - test_trsm("LLNU", alpha); - test_trsm("LLTN", alpha); - test_trsm("LLTU", alpha); - test_trsm("LUNN", alpha); - test_trsm("LUNU", alpha); - test_trsm("LUTN", alpha); - test_trsm("LUTU", alpha); - - test_trsm("RLNN", alpha); - test_trsm("RLNU", alpha); - test_trsm("RLTN", alpha); - test_trsm("RLTU", alpha); - test_trsm("RUNN", alpha); - test_trsm("RUNU", alpha); - test_trsm("RUTN", alpha); - test_trsm("RUTU", alpha); + test_trsm("LLNN", alpha); + test_trsm("LLNU", alpha); + test_trsm("LLTN", alpha); + test_trsm("LLTU", alpha); + test_trsm("LUNN", alpha); + test_trsm("LUNU", alpha); + test_trsm("LUTN", alpha); + test_trsm("LUTU", alpha); + + test_trsm("RLNN", alpha); + test_trsm("RLNU", alpha); + test_trsm("RLTN", alpha); + test_trsm("RLTU", alpha); + test_trsm("RUNN", alpha); + test_trsm("RUNU", alpha); + test_trsm("RUTN", alpha); + test_trsm("RUTU", alpha); alpha = 4.5f; - test_trsm("LLNN", alpha); - test_trsm("LLNU", alpha); - test_trsm("LLTN", alpha); - test_trsm("LLTU", alpha); - test_trsm("LUNN", alpha); - test_trsm("LUNU", alpha); - test_trsm("LUTN", alpha); - test_trsm("LUTU", alpha); - - test_trsm("RLNN", alpha); - test_trsm("RLNU", alpha); - test_trsm("RLTN", alpha); - test_trsm("RLTU", alpha); - test_trsm("RUNN", alpha); - test_trsm("RUNU", alpha); - test_trsm("RUTN", alpha); - test_trsm("RUTU", alpha); + test_trsm("LLNN", alpha); + test_trsm("LLNU", alpha); + test_trsm("LLTN", alpha); + test_trsm("LLTU", alpha); 
+ test_trsm("LUNN", alpha); + test_trsm("LUNU", alpha); + test_trsm("LUTN", alpha); + test_trsm("LUTU", alpha); + + test_trsm("RLNN", alpha); + test_trsm("RLNU", alpha); + test_trsm("RLTN", alpha); + test_trsm("RLTU", alpha); + test_trsm("RUNN", alpha); + test_trsm("RUNU", alpha); + test_trsm("RUTN", alpha); + test_trsm("RUTU", alpha); Kokkos::Profiling::popRegion(); } #endif @@ -307,42 +307,42 @@ TEST_F(TestCategory, trsm_float) { TEST_F(TestCategory, trsm_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_double"); double alpha = 1.0; - test_trsm("LLNN", alpha); - test_trsm("LLNU", alpha); - test_trsm("LLTN", alpha); - test_trsm("LLTU", alpha); - test_trsm("LUNN", alpha); - test_trsm("LUNU", alpha); - test_trsm("LUTN", alpha); - test_trsm("LUTU", alpha); - - test_trsm("RLNN", alpha); - test_trsm("RLNU", alpha); - test_trsm("RLTN", alpha); - test_trsm("RLTU", alpha); - test_trsm("RUNN", alpha); - test_trsm("RUNU", alpha); - test_trsm("RUTN", alpha); - test_trsm("RUTU", alpha); + test_trsm("LLNN", alpha); + test_trsm("LLNU", alpha); + test_trsm("LLTN", alpha); + test_trsm("LLTU", alpha); + test_trsm("LUNN", alpha); + test_trsm("LUNU", alpha); + test_trsm("LUTN", alpha); + test_trsm("LUTU", alpha); + + test_trsm("RLNN", alpha); + test_trsm("RLNU", alpha); + test_trsm("RLTN", alpha); + test_trsm("RLTU", alpha); + test_trsm("RUNN", alpha); + test_trsm("RUNU", alpha); + test_trsm("RUTN", alpha); + test_trsm("RUTU", alpha); alpha = 4.5; - test_trsm("LLNN", alpha); - test_trsm("LLNU", alpha); - test_trsm("LLTN", alpha); - test_trsm("LLTU", alpha); - test_trsm("LUNN", alpha); - test_trsm("LUNU", alpha); - test_trsm("LUTN", alpha); - test_trsm("LUTU", alpha); - - test_trsm("RLNN", alpha); - test_trsm("RLNU", alpha); - test_trsm("RLTN", alpha); - test_trsm("RLTU", alpha); - test_trsm("RUNN", alpha); - test_trsm("RUNU", alpha); - test_trsm("RUTN", alpha); - test_trsm("RUTU", alpha); + test_trsm("LLNN", alpha); + test_trsm("LLNU", alpha); + test_trsm("LLTN", 
alpha); + test_trsm("LLTU", alpha); + test_trsm("LUNN", alpha); + test_trsm("LUNU", alpha); + test_trsm("LUTN", alpha); + test_trsm("LUTU", alpha); + + test_trsm("RLNN", alpha); + test_trsm("RLNU", alpha); + test_trsm("RLTN", alpha); + test_trsm("RLTU", alpha); + test_trsm("RUNN", alpha); + test_trsm("RUNU", alpha); + test_trsm("RUTN", alpha); + test_trsm("RUTU", alpha); Kokkos::Profiling::popRegion(); } #endif @@ -353,73 +353,73 @@ TEST_F(TestCategory, trsm_double) { TEST_F(TestCategory, trsm_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_complex_double"); Kokkos::complex alpha = 1.0; - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUNN", alpha); - test_trsm, 
Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUCU", alpha); alpha = Kokkos::complex(4.5, 0.0); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "LUCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, TestDevice>( "RUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( + test_trsm, Kokkos::complex, 
TestDevice>( "RUCU", alpha); Kokkos::Profiling::popRegion(); } @@ -431,74 +431,74 @@ TEST_F(TestCategory, trsm_complex_double) { TEST_F(TestCategory, trsm_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::trsm_complex_float"); Kokkos::complex alpha = 1.0f; - test_trsm, Kokkos::complex, TestExecSpace>( - "LLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUCU", alpha); - - test_trsm, Kokkos::complex, TestExecSpace>( - "RLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", + alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", + alpha); + test_trsm, Kokkos::complex, 
TestDevice>("RLCU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", + alpha); alpha = Kokkos::complex(4.5f, 0.0f); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "LUCU", alpha); - - test_trsm, Kokkos::complex, TestExecSpace>( - "RLNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RLCU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUNN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUNU", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUCN", alpha); - test_trsm, Kokkos::complex, TestExecSpace>( - "RUCU", alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LLCU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("LUCU", + alpha); + + test_trsm, Kokkos::complex, TestDevice>("RLNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RLCN", + alpha); + 
test_trsm, Kokkos::complex, TestDevice>("RLCU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUNU", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCN", + alpha); + test_trsm, Kokkos::complex, TestDevice>("RUCU", + alpha); Kokkos::Profiling::popRegion(); } #endif diff --git a/blas/unit_test/Test_Blas_Newton.hpp b/blas/unit_test/Test_Blas_Newton.hpp index 001a6b2395..5bb6946e99 100644 --- a/blas/unit_test/Test_Blas_Newton.hpp +++ b/blas/unit_test/Test_Blas_Newton.hpp @@ -191,8 +191,8 @@ int test_intersection() { template int test_newton() { - Test::test_logistic(); - Test::test_intersection(); + Test::test_logistic(); + Test::test_intersection(); return 1; } diff --git a/blas/unit_test/Test_Blas_serial_axpy.hpp b/blas/unit_test/Test_Blas_serial_axpy.hpp index 48b417c96d..427925a3dc 100644 --- a/blas/unit_test/Test_Blas_serial_axpy.hpp +++ b/blas/unit_test/Test_Blas_serial_axpy.hpp @@ -32,6 +32,7 @@ struct NaiveAxpyTag {}; template struct Functor_TestBlasSerialAxpy { + using execution_space = typename DeviceType::execution_space; ScalarType _alpha; ViewType _x; ViewType _y; @@ -71,7 +72,7 @@ struct Functor_TestBlasSerialAxpy { std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return; @@ -155,35 +156,35 @@ int test_blas_serial_axpy() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, serial_axpy_float_float) { - test_blas_serial_axpy(); + test_blas_serial_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, serial_axpy_double_double) { - test_blas_serial_axpy(); + test_blas_serial_axpy(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, serial_axpy_dcomplex_dcomplex) { - 
test_blas_serial_axpy, + test_blas_serial_axpy, Kokkos::complex >(); } TEST_F(TestCategory, serial_axpy_dcomplex_double) { - test_blas_serial_axpy, double>(); + test_blas_serial_axpy, double>(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, serial_axpy_fcomplex_fcomplex) { - test_blas_serial_axpy, + test_blas_serial_axpy, Kokkos::complex >(); } TEST_F(TestCategory, serial_axpy_fcomplex_float) { - test_blas_serial_axpy, float>(); + test_blas_serial_axpy, float>(); } #endif diff --git a/blas/unit_test/Test_Blas_serial_nrm2.hpp b/blas/unit_test/Test_Blas_serial_nrm2.hpp index a4af218ff3..147df52353 100644 --- a/blas/unit_test/Test_Blas_serial_nrm2.hpp +++ b/blas/unit_test/Test_Blas_serial_nrm2.hpp @@ -70,7 +70,7 @@ struct Functor_TestBlasSerialNrm2 { std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return; @@ -125,7 +125,7 @@ struct Functor_TestBlasSerialNrm2MV { std::string name = name_region + name_value_type + name_work_tag + name_test_id; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::RangePolicy policy(0, _x.extent(0)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); return; @@ -263,25 +263,25 @@ int test_blas_serial_nrm2() { #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, serial_nrm2_float_float) { - test_blas_serial_nrm2(); + test_blas_serial_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, serial_nrm2_double_double) { - test_blas_serial_nrm2(); + test_blas_serial_nrm2(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) TEST_F(TestCategory, serial_nrm2_fcomplex_float) { - test_blas_serial_nrm2 >(); + test_blas_serial_nrm2 >(); } #endif #if 
defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) { - test_blas_serial_nrm2 >(); + test_blas_serial_nrm2 >(); } #endif diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 913b4e67a5..3358ae2eb8 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -367,8 +367,11 @@ display_help_text() { echo "--disable-perftests: Do not build Kokkos Kernels performance tests" echo "--enable-perftests: build Kokkos Kernels performance tests (default)" echo "--deprecated-code Enable deprecated code (disabled by default)" + echo "--deprecated-code-warnings Enable deprecated code warnings (disabled by default)" echo "--export-compile-commands: export cmake compile_commands.json file" echo "--enable-docs: build the Kokkos Kernels developer documentation (requires sphinx, doxygen)" + echo "--cmake-flags=[CMAKE Command options]: Set Kokkos Kernels cmake options not handled by script" + echo "--kokkos-cmake-flags=[CMAKE Command options]: Set Kokkos cmake options not handled by script" } @@ -385,6 +388,7 @@ KOKKOSKERNELS_DO_DOCS=OFF CMAKE_EXPORT_COMPILE_COMMANDS=OFF #Build static libraries by default +# Shared libraries are required for Sycl on Intel BUILD_SHARED_LIBRARIES=OFF KOKKOS_MAKEINSTALL_J=4 @@ -396,6 +400,7 @@ WITH_CUDA_BACKEND=OFF WITH_HIP_BACKEND=OFF KOKKOS_DEPRECATED_CODE=OFF +KOKKOS_DEPRECATED_CODE_WARNINGS=OFF while [[ $# > 0 ]] do @@ -523,6 +528,12 @@ do --release) KOKKOSKERNELS_RELEASE=ON ;; + --cmake-flags*) + PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; + --kokkos-cmake-flags*) + KOKKOS_PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; --kokkos-make-j*) echo "${key} parallel level for kokkos install" KOKKOS_MAKEINSTALL_J="${key#*=}" @@ -574,6 +585,9 @@ do --deprecated-code) KOKKOS_DEPRECATED_CODE=ON ;; + --deprecated-code-warnings) + KOKKOS_DEPRECATED_CODE_WARNINGS=ON + ;; --enable-docs) KOKKOSKERNELS_DO_DOCS=ON ;; @@ -797,9 +811,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake 
$COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} 
${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J @@ -824,6 +838,6 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} 
-DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${PASSTHRU_CMAKE_FLAGS} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} 
${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${PASSTHRU_CMAKE_FLAGS} ${KOKKOSKERNELS_PATH} diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index fbceffe76c..9b649d26c6 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -11,3 +11,13 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) INCLUDE("${KokkosKernels_CMAKE_DIR}/KokkosKernelsTargets.cmake") +IF(NOT TARGET KokkosKernels::all_libs) + # CMake Error at /lib/cmake/Kokkos/KokkosConfigCommon.cmake:10 (ADD_LIBRARY): + # ADD_LIBRARY cannot create ALIAS 
target "Kokkos::all_libs" because target + # "KokkosKernels::kokkoskernels" is imported but not globally visible. + IF(CMAKE_VERSION VERSION_LESS "3.18") + SET_TARGET_PROPERTIES(Kokkos::kokkoskernels PROPERTIES IMPORTED_GLOBAL ON) + ENDIF() + ADD_LIBRARY(KokkosKernels::all_libs ALIAS Kokkos::kokkoskernels) + ADD_LIBRARY(KokkosKernels::kokkoskernels ALIAS Kokkos::kokkoskernels) +ENDIF() diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 22b7a196fc..d94860e380 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -29,6 +29,7 @@ requires (a) header file(s) as well, and may use functions other than just BLAS and LAPACK functions. */ #cmakedefine HAVE_KOKKOSKERNELS_MKL +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE #cmakedefine KOKKOSKERNELS_ENABLE_BENCHMARK @@ -49,14 +50,14 @@ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_CUDA #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE -/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +/* Whether to build kernels for execution space Kokkos::HIP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE /* Whether to build kernels for execution space Kokkos::Experimental::SYCL */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_SYCL #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE -/* Whether to build kernels for execution space Kokkos::Experimental::HIP */ +/* Whether to build kernels for execution space Kokkos::Experimental::OpenMPTarget */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMPTARGET #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_OPENMPTARGETSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ @@ -109,6 +110,8 @@ /* BLAS library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_BLAS +/* LAPACK */ +#cmakedefine 
KOKKOSKERNELS_ENABLE_TPL_LAPACK /* MKL library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL /* CUSPARSE */ diff --git a/cmake/fake_tribits.cmake b/cmake/fake_tribits.cmake index 4a44ffea86..52e9c00b72 100644 --- a/cmake/fake_tribits.cmake +++ b/cmake/fake_tribits.cmake @@ -250,14 +250,6 @@ FUNCTION(KOKKOSKERNELS_ADD_ADVANCED_TEST) ENDIF() ENDFUNCTION() -MACRO(KOKKOSKERNELS_EXCLUDE_AUTOTOOLS_FILES) - IF (KOKKOSKERNELS_HAS_TRILINOS) - TRIBITS_EXCLUDE_AUTOTOOLS_FILES() - ELSE() - #DO nothing - ENDIF() -ENDMACRO(KOKKOSKERNELS_EXCLUDE_AUTOTOOLS_FILES) - FUNCTION(KOKKOSKERNELS_LIB_TYPE LIB RET) GET_TARGET_PROPERTY(PROP ${LIB} TYPE) IF (${PROP} STREQUAL "INTERFACE_LIBRARY") diff --git a/cmake/kokkoskernels_benchmarks.cmake b/cmake/kokkoskernels_benchmarks.cmake index 3a38feee88..07f0515b4e 100644 --- a/cmake/kokkoskernels_benchmarks.cmake +++ b/cmake/kokkoskernels_benchmarks.cmake @@ -18,11 +18,27 @@ ELSE() # Note: recent bug (google/benchmark#1441) is preventing us from using # the latest benchmark release. SET(BENCHMARK_VERSION 1.6.2) - FetchContent_Declare( - googlebenchmark - URL https://github.com/google/benchmark/archive/refs/tags/v${BENCHMARK_VERSION}.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b - ) + + # CMake 3.24 introduced DOWNLOAD_EXTRACT_TIMESTAMP, which controls whether + # extracting this file archive sets the file times to archive time (TRUE), + # or to extraction time (FALSE). 
+ # In CMake 3.24+, the default is FALSE + # Before 3.24 the option did not exist and the behavior was effectively TRUE + # Here, we explicitly accept the new default to silence the CMP0135 warning + IF (${CMAKE_VERSION} VERSION_LESS "3.24.0") + FetchContent_Declare( + googlebenchmark + URL https://github.com/google/benchmark/archive/refs/tags/v${BENCHMARK_VERSION}.tar.gz + URL_HASH MD5=14d14849e075af116143a161bc3b927b + ) + ELSE() + FetchContent_Declare( + googlebenchmark + URL https://github.com/google/benchmark/archive/refs/tags/v${BENCHMARK_VERSION}.tar.gz + URL_HASH MD5=14d14849e075af116143a161bc3b927b + DOWNLOAD_EXTRACT_TIMESTAMP FALSE + ) + ENDIF() FetchContent_MakeAvailable(googlebenchmark) LIST(POP_BACK CMAKE_MESSAGE_INDENT) diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index 84c68658b7..49bc2f4ae6 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -29,6 +29,13 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to build the blas component. Default: OFF" ) +KOKKOSKERNELS_ADD_OPTION( + "ENABLE_COMPONENT_LAPACK" + OFF + BOOL + "Whether to build the lapack component. Default: OFF" +) + # SPARSE depends on everything else at the moment. 
KOKKOSKERNELS_ADD_OPTION( "ENABLE_COMPONENT_SPARSE" @@ -67,6 +74,7 @@ ENDIF() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) + SET(KokkosKernels_ENABLE_COMPONENT_LAPACK ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() @@ -74,6 +82,7 @@ ENDIF() IF (KokkosKernels_ENABLE_ALL_COMPONENTS) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) + SET(KokkosKernels_ENABLE_COMPONENT_LAPACK ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_SPARSE ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_ODE ON CACHE BOOL "" FORCE) @@ -85,6 +94,7 @@ ENDIF() # but marking it as advanced should hide it from GUIs IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED AND KokkosKernels_ENABLE_COMPONENT_BLAS + AND KokkosKernels_ENABLE_COMPONENT_LAPACK AND KokkosKernels_ENABLE_COMPONENT_GRAPH AND KokkosKernels_ENABLE_COMPONENT_SPARSE AND KokkosKernels_ENABLE_COMPONENT_ODE) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index d223e00171..8c6cb540ae 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -12,7 +12,7 @@ SET(EXEC_SPACES EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) -SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) +SET(EXECSPACE_HIP_CPP_TYPE Kokkos::HIP) SET(EXECSPACE_SYCL_CPP_TYPE Kokkos::Experimental::SYCL) SET(EXECSPACE_OPENMPTARGET_CPP_TYPE Kokkos::Experimental::OpenMPTarget) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) @@ -31,7 +31,7 @@ SET(MEM_SPACES ) SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) -SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) 
+SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::HIPSpace) SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace) SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) SET(MEMSPACE_OPENMPTARGETSPACE_CPP_TYPE Kokkos::Experimental::OpenMPTargetSpace) @@ -77,13 +77,13 @@ IF(KOKKOS_ENABLE_HIP) INST_EXECSPACE_HIP ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} BOOL - "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the execution space Kokkos::HIP. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HIPSPACE ${KOKKOSKERNELS_INST_EXECSPACE_HIP_DEFAULT} BOOL - "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the memory space Kokkos::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." ) IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index f650168757..08c7158148 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -31,6 +31,10 @@ MACRO(KOKKOSKERNELS_ADD_TPL_OPTION NAME DEFAULT_VALUE DOCSTRING) SET(ROOT_DEFAULT $ENV{${_NAME_ORIG}_ROOT}) KOKKOSKERNELS_ADD_OPTION(${_NAME_ORIG}_ROOT "${ROOT_DEFAULT}" PATH "Location of ${_NAME} install root. 
Default: None or the value of the environment variable ${_NAME}_ROOT if set") IF (DEFINED TPL_ENABLE_${_NAME}) + IF (${_NAME} STREQUAL MKL AND KOKKOSKERNELS_HAS_TRILINOS) + MESSAGE("Trilinos has enabled MKL and SYCL but it does not detect oneMKL correctly so we disable it!") + SET(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE ON) + ENDIF () IF (TPL_ENABLE_${_NAME} AND NOT KOKKOSKERNELS_ENABLE_TPL_${_NAME}) MESSAGE("Overriding KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG}=OFF with TPL_ENABLE_${_NAME}=ON") SET(KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG} ON) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 88bf237274..b065869296 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,3 +1,4 @@ # Adding source directory to the build LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/impl) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) diff --git a/common/impl/KokkosKernels_AlwaysFalse.hpp b/common/impl/KokkosKernels_AlwaysFalse.hpp new file mode 100644 index 0000000000..12acf4a524 --- /dev/null +++ b/common/impl/KokkosKernels_AlwaysFalse.hpp @@ -0,0 +1,28 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_ALWAYSFALSE_HPP +#define KOKKOSKERNELS_ALWAYSFALSE_HPP + +namespace KokkosKernels::Impl { + +// for use in static asserts +template +inline constexpr bool always_false_v = false; + +} // namespace KokkosKernels::Impl + +#endif // KOKKOSKERNELS_ALWAYSFALSE_HPP diff --git a/common/src/KokkosKernels_Iota.hpp b/common/impl/KokkosKernels_Iota.hpp similarity index 98% rename from common/src/KokkosKernels_Iota.hpp rename to common/impl/KokkosKernels_Iota.hpp index 5b7e24ca24..04851e81c9 100644 --- a/common/src/KokkosKernels_Iota.hpp +++ b/common/impl/KokkosKernels_Iota.hpp @@ -55,7 +55,7 @@ class Iota { public: using size_type = SizeType; using value_type = T; - using non_const_value_type = std::remove_const; + using non_const_value_type = std::remove_const_t; using device_type = void; using data_type = const value_type *; diff --git a/common/impl/KokkosKernels_SafeCompare.hpp b/common/impl/KokkosKernels_SafeCompare.hpp new file mode 100644 index 0000000000..494ef45ada --- /dev/null +++ b/common/impl/KokkosKernels_SafeCompare.hpp @@ -0,0 +1,81 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_SAFECOMPARE_HPP +#define KOKKOSKERNELS_SAFECOMPARE_HPP + +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosKernels { +namespace Impl { + +/*! 
\brief t > u + + When comparing signed and unsigned types of the same size, the signed type + is converted to unsigned which produces strange behavior like int32_t(-1) > + uint32_t(1). This function casts its arguments to types that can represent + the full range of both argument types, before comparing. + + Basically this boils down to: + 1. forbidding any comparisons between signed integers and uint64_t, + since there's no reliable signed integer type larger than 64 bits. + 2. Using a type large enough to represent both sides of a comparison + otherwise. + + If T and U are ints, and T xor U is signed, choose a signed type large + enough to represent all values of both T and U + + This function does not protect you from casting an int to a float where that + value is not representable. +*/ +template +KOKKOS_INLINE_FUNCTION constexpr bool safe_gt(const T &t, const U &u) { + using KT = Kokkos::ArithTraits; + using KU = Kokkos::ArithTraits; + + // both are integer, but only one is signed + if constexpr (KT::is_integer && KU::is_integer && + (KT::is_signed != KU::is_signed)) { + // how wide the signed type would need to be to hold T and U + constexpr size_t t_width = KT::is_signed ? sizeof(T) : 2 * sizeof(T); + constexpr size_t u_width = KU::is_signed ? 
sizeof(U) : 2 * sizeof(U); + + // compare using the max width + constexpr size_t width = KOKKOSKERNELS_MACRO_MAX(t_width, u_width); + if constexpr (width == 1) { + return int8_t(t) > int8_t(u); + } else if constexpr (width == 2) { + return int16_t(t) > int16_t(u); + } else if constexpr (width == 4) { + return int32_t(t) > int32_t(u); + } else if constexpr (width == 8) { + return int64_t(t) > int64_t(u); + } else { + static_assert(std::is_same_v, "no safe way to compare types"); + } + } else { + // use whatever the default comparison rules are + return t > u; + } + + // CUDA 11.2 issues a spurious missing return warning + return false; +} + +} // namespace Impl +} // namespace KokkosKernels + +#endif // KOKKOSKERNELS_SAFECOMPARE_HPP \ No newline at end of file diff --git a/common/impl/KokkosKernels_ViewUtils.hpp b/common/impl/KokkosKernels_ViewUtils.hpp new file mode 100644 index 0000000000..ac4abb6457 --- /dev/null +++ b/common/impl/KokkosKernels_ViewUtils.hpp @@ -0,0 +1,65 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_VIEWUTILS_HPP +#define KOKKOSKERNELS_VIEWUTILS_HPP +#include "Kokkos_Core.hpp" + +namespace KokkosKernels::Impl { +// lbv - 07/26/2023: +// MemoryTraits::impl_value was added +// in Kokkos 4.1.00 so we should guard +// the content of this header until v4.3.0 +#if KOKKOS_VERSION >= 40100 || defined(DOXY) + +/*! 
\brief Yields a type that is View with Kokkos::Unmanaged added to the memory + * traits + */ +template +class with_unmanaged { + using data_type = typename View::data_type; + using layout_type = typename View::array_layout; + using memory_space = typename View::memory_space; + + using orig_traits = typename View::memory_traits; + static constexpr unsigned new_traits = + orig_traits::impl_value | Kokkos::Unmanaged; + + public: + using type = Kokkos::View >; +}; + +/*! \brief A type that is View with Kokkos::Unmanaged added to the memory traits + + \tparam View the type to add Kokkos::Unmanaged to + */ +template +using with_unmanaged_t = typename with_unmanaged::type; + +/*! \brief Returns an unmanaged version of v + + \tparam View the type of the input view v + */ +template +auto make_unmanaged(const View &v) { + return typename with_unmanaged::type(v); +} + +#endif // KOKKOS_VERSION >= 40100 +} // namespace KokkosKernels::Impl + +#endif diff --git a/common/src/KokkosKernels_BlockHashmapAccumulator.hpp b/common/src/KokkosKernels_BlockHashmapAccumulator.hpp index f275bd007a..3ca160164c 100644 --- a/common/src/KokkosKernels_BlockHashmapAccumulator.hpp +++ b/common/src/KokkosKernels_BlockHashmapAccumulator.hpp @@ -250,9 +250,8 @@ struct BlockHashmapAccumulator { KokkosSparse::Impl::kk_block_set_mul( block_dim, values + my_write_index * block_size, valA, valB); -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done @@ -327,9 +326,8 @@ struct BlockHashmapAccumulator { KokkosSparse::Impl::kk_block_set_mul( block_dim, values + my_write_index * block_size, valA, valB); -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -407,9 +405,8 @@ struct BlockHashmapAccumulator { } else { keys[my_write_index] = key; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -460,9 +457,8 @@ struct BlockHashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done @@ -514,9 +510,8 @@ struct BlockHashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -564,9 +559,8 @@ struct BlockHashmapAccumulator { } else { keys[my_write_index] = key; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index 4d732a8437..df8b21b8df 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -18,6 +18,7 @@ #define KOKKOSKERNELS_ERROR_HPP #include +#include namespace KokkosKernels { namespace Impl { @@ -79,6 +80,7 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, } while (0) // SYCL cannot printf like the other backends quite yet +#if KOKKOS_VERSION < 40199 #define IMPL_KERNEL_THROW(condition, msg) \ do { \ if (!(condition)) { \ @@ -87,6 +89,15 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, Kokkos::abort(""); \ } \ } while (0) +#else +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if (!(condition)) { \ + Kokkos::printf("KERNEL CHECK FAILED:\n %s\n %s\n", #condition, msg); \ + Kokkos::abort(""); \ + } \ + } while (0) +#endif #ifndef NDEBUG #define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index a0f6e39f4d..2ec09f4069 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -66,7 +66,7 @@ KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) { + if (std::is_same::value) { exec_space = Exec_HIP; } #endif @@ -98,8 +98,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { #ifdef KOKKOS_ENABLE_HIP template <> -constexpr KOKKOS_INLINE_FUNCTION bool -kk_is_gpu_exec_space() { +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { return true; } #endif @@ -157,29 +156,68 @@ inline void kk_get_free_total_memory(size_t& /* free_mem */, throw std::runtime_error(oss.str()); } +// Host function to determine free and total device memory. 
+// Will throw if execution space doesn't support this. +template +inline void kk_get_free_total_memory(size_t& /* free_mem */, + size_t& /* total_mem */, + int /* n_streams */) { + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() + << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + #ifdef KOKKOS_ENABLE_CUDA template <> inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { + size_t& total_mem, + int n_streams) { cudaMemGetInfo(&free_mem, &total_mem); + free_mem /= n_streams; + total_mem /= n_streams; +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem, + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { - cudaMemGetInfo(&free_mem, &total_mem); + kk_get_free_total_memory(free_mem, total_mem, 1); +} +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem, int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - cudaMemGetInfo(&free_mem, &total_mem); + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif #ifdef KOKKOS_ENABLE_HIP template <> -inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem, + int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); + free_mem /= n_streams; + total_mem /= n_streams; +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif @@ -188,7 +226,7 @@ inline void 
kk_get_free_total_memory( #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { + size_t& free_mem, size_t& total_mem, int n_streams) { sycl::queue queue; sycl::device device = queue.get_device(); auto level_zero_handle = @@ -220,20 +258,43 @@ inline void kk_get_free_total_memory( total_mem += memory_states.size; free_mem += memory_states.free; } + free_mem /= n_streams; + total_mem /= n_streams; +} + +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory( + free_mem, total_mem, 1); +} + +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem, int n_streams) { + kk_get_free_total_memory( + free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, - total_mem); + kk_get_free_total_memory( + free_mem, total_mem, 1); +} + +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem, int n_streams) { + kk_get_free_total_memory( + free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, - total_mem); + kk_get_free_total_memory( + free_mem, total_mem, 1); } #endif @@ -343,13 +404,13 @@ struct SpaceInstance { #ifdef KOKKOS_ENABLE_HIP template <> -struct SpaceInstance { - static Kokkos::Experimental::HIP create() { +struct SpaceInstance { + static Kokkos::HIP create() { hipStream_t stream; KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); - return Kokkos::Experimental::HIP(stream); + return Kokkos::HIP(stream); } - static void destroy(Kokkos::Experimental::HIP& space) { + static void destroy(Kokkos::HIP& space) { hipStream_t stream = space.hip_stream(); KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream)); } diff --git 
a/common/src/KokkosKernels_Half.hpp b/common/src/KokkosKernels_Half.hpp index eddd2f1e4c..c22646b5aa 100644 --- a/common/src/KokkosKernels_Half.hpp +++ b/common/src/KokkosKernels_Half.hpp @@ -14,6 +14,7 @@ // //@HEADER +#if KOKKOS_VERSION < 40199 #ifndef KOKKOSKERNELS_HALF_HPP #define KOKKOSKERNELS_HALF_HPP @@ -61,3 +62,4 @@ namespace Experimental { } // namespace Experimental } // namespace KokkosKernels #endif // KOKKOSKERNELS_HALF_HPP +#endif // KOKKOS_VERSION < 40199 diff --git a/common/src/KokkosKernels_HashmapAccumulator.hpp b/common/src/KokkosKernels_HashmapAccumulator.hpp index 3a12d399f2..1085cec4af 100644 --- a/common/src/KokkosKernels_HashmapAccumulator.hpp +++ b/common/src/KokkosKernels_HashmapAccumulator.hpp @@ -16,6 +16,7 @@ #ifndef _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP #define _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP #include +#include "KokkosKernels_Macros.hpp" #include namespace KokkosKernels { @@ -412,9 +413,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA because warps do not go in SIMD fashion +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ because warps do not go in SIMD fashion // anymore. while some thread might insert my_write_index into linked // list, another thread in the warp might be reading keys in above loop. // before inserting the new value in liked list -- which is done with @@ -483,9 +483,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. 
while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -601,9 +600,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -679,9 +677,8 @@ struct HashmapAccumulator { } else { keys[my_write_index] = key; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -732,9 +729,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. 
before inserting the new value in liked list -- which is done @@ -786,9 +782,8 @@ struct HashmapAccumulator { keys[my_write_index] = key; values[my_write_index] = value; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done @@ -836,9 +831,8 @@ struct HashmapAccumulator { } else { keys[my_write_index] = key; -#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ - defined(KOKKOS_ARCH_AMPERE) - // this is an issue on VOLTA and up because warps do not go in SIMD +#ifdef KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS + // this is an issue on VOLTA+ and up because warps do not go in SIMD // fashion anymore. while some thread might insert my_write_index into // linked list, another thread in the warp might be reading keys in above // loop. before inserting the new value in liked list -- which is done diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp index 22df9545ef..e091932453 100644 --- a/common/src/KokkosKernels_LowerBound.hpp +++ b/common/src/KokkosKernels_LowerBound.hpp @@ -77,8 +77,8 @@ namespace Impl { /*! 
\brief Single-thread sequential lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -96,9 +96,6 @@ lower_bound_sequential_thread( using size_type = typename ViewLike::size_type; static_assert(1 == ViewLike::rank, "lower_bound_sequential_thread requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, - "lower_bound_sequential_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); size_type i = 0; while (i < view.size() && pred(view(i), value)) { @@ -109,8 +106,8 @@ lower_bound_sequential_thread( /*! \brief Single-thread binary lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -127,9 +124,6 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( using size_type = typename ViewLike::size_type; static_assert(1 == ViewLike::rank, "lower_bound_binary_thread requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, - "lower_bound_binary_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); size_type lo = 0; size_type hi = view.size(); @@ -150,8 +144,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( /*! 
\brief single-thread lower-bound search - \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota - \tparam Pred a binary predicate function + \tparam ViewLike A Kokkos::View, KokkosKernels::Impl::Iota, or + KokkosSparse::MergeMatrixDiagonal \tparam Pred a binary predicate function \param view the view to search \param value the value to search for \param pred a binary predicate function @@ -168,9 +162,6 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread( Pred pred = Pred()) { static_assert(1 == ViewLike::rank, "lower_bound_thread requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, - "lower_bound_thread requires a " - "KokkosKernels::Impl::Iota or a Kokkos::View"); /* sequential search makes on average 0.5 * view.size memory accesses binary search makes log2(view.size)+1 accesses @@ -448,7 +439,8 @@ KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_team( const TeamMember &handle, const ViewLike &view, const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { static_assert(1 == ViewLike::rank, "lower_bound_team requires rank-1 views"); - static_assert(is_iota_v || Kokkos::is_view::value, + static_assert(KokkosKernels::Impl::is_iota_v || + Kokkos::is_view::value, "lower_bound_team requires a " "KokkosKernels::Impl::Iota or a Kokkos::View"); diff --git a/common/src/KokkosKernels_Macros.hpp b/common/src/KokkosKernels_Macros.hpp index d7f7af5a79..04234a5ce2 100644 --- a/common/src/KokkosKernels_Macros.hpp +++ b/common/src/KokkosKernels_Macros.hpp @@ -96,4 +96,13 @@ #endif // KOKKOS_COMPILER_GNU /******* END other helper macros *******/ +// define KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS if we are targeting a CUDA +// architecture with "independent thread scheduling" (Volta70 and up). This +// requires some extra logic in HashmapAccumulator to avoid data races. 
+#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_ADA89) || \ + defined(KOKKOS_ARCH_HOPPER) +#define KOKKOSKERNELS_CUDA_INDEPENDENT_THREADS +#endif + #endif // KOKKOSKERNELS_MACROS_HPP_ diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index a271695246..60bdd097e1 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -84,7 +84,7 @@ struct InclusiveParallelPrefixSum { * \param num_elements: size of the array * \param arr: the array for which the prefix sum will be performed. */ -template +template inline void kk_exclusive_parallel_prefix_sum( const MyExecSpace &exec, typename view_t::value_type num_elements, view_t arr) { @@ -100,7 +100,7 @@ inline void kk_exclusive_parallel_prefix_sum( * \param num_elements: size of the array * \param arr: the array for which the prefix sum will be performed. */ -template +template inline void kk_exclusive_parallel_prefix_sum( typename view_t::value_type num_elements, view_t arr) { kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr); @@ -116,7 +116,7 @@ inline void kk_exclusive_parallel_prefix_sum( * \param finalSum: will be set to arr[num_elements - 1] after computing the * prefix sum. */ -template +template inline void kk_exclusive_parallel_prefix_sum( const MyExecSpace &exec, typename view_t::value_type num_elements, view_t arr, typename view_t::non_const_value_type &finalSum) { @@ -135,29 +135,45 @@ inline void kk_exclusive_parallel_prefix_sum( * \param finalSum: will be set to arr[num_elements - 1] after computing the * prefix sum. 
*/ -template +template inline void kk_exclusive_parallel_prefix_sum( typename view_t::value_type num_elements, view_t arr, typename view_t::non_const_value_type &finalSum) { kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); } -/*** - * \brief Function performs the inclusive parallel prefix sum. That is each - * entry holds the sum until itself including itself. \param num_elements: size - * of the array \param arr: the array for which the prefix sum will be - * performed. - */ -template +/// +/// \brief Function performs the inclusive parallel prefix sum. That is each +/// entry holds the sum until itself including itself. +/// \param my_exec_space: The execution space instance +/// \param num_elements: size of the array +/// \param arr: the array for which the prefix sum will be performed. +/// +template void kk_inclusive_parallel_prefix_sum( + MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, forward_array_type arr) { - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(0, num_elements), + range_policy_t(my_exec_space, 0, num_elements), InclusiveParallelPrefixSum(arr)); } +/// +/// \brief Function performs the inclusive parallel prefix sum. That is each +/// entry holds the sum until itself including itself. +/// \param num_elements: size of the array +/// \param arr: the array for which the prefix sum will be performed. 
+/// +template +void kk_inclusive_parallel_prefix_sum( + typename forward_array_type::value_type num_elements, + forward_array_type arr) { + MyExecSpace my_exec_space; + return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); +} + template struct ReductionFunctor { view_t array_sum; @@ -324,11 +340,19 @@ struct IsRelativelyIdenticalFunctor { } if (val_diff > mag_type(eps)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " "(eps = %e)\n", (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), KAT::imag(view2(i)), eps); +#else + Kokkos::printf( + "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " + "(eps = %e)\n", + (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), + KAT::imag(view2(i)), eps); +#endif num_diffs++; } } diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 2a4b749f92..e1c15505ff 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -456,18 +456,25 @@ struct Fill_Reverse_Map { template void inclusive_parallel_prefix_sum( + MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, forward_array_type arr) { - kk_inclusive_parallel_prefix_sum( - num_elements, arr); + return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); +} + +template +void inclusive_parallel_prefix_sum( + typename forward_array_type::value_type num_elements, + forward_array_type arr) { + MyExecSpace my_exec_space; + return inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template void exclusive_parallel_prefix_sum( typename forward_array_type::value_type num_elements, forward_array_type arr) { - kk_exclusive_parallel_prefix_sum( - num_elements, arr); + kk_exclusive_parallel_prefix_sum(num_elements, arr); } template @@ -661,6 +668,7 @@ struct StridedCopy { template void create_reverse_map( + 
MyExecSpace my_exec_space, const typename reverse_array_type::value_type &num_forward_elements, // num_vertices const typename forward_array_type::value_type @@ -675,11 +683,13 @@ void create_reverse_map( const lno_t MINIMUM_TO_ATOMIC = 64; - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; reverse_map_xadj = - reverse_array_type("Reverse Map Xadj", num_reverse_elements + 1); + reverse_array_type(Kokkos::view_alloc(my_exec_space, "Reverse Map Xadj"), + num_reverse_elements + 1); reverse_map_adj = reverse_array_type( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "REVERSE_ADJ"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "REVERSE_ADJ"), num_forward_elements); if (num_reverse_elements < MINIMUM_TO_ATOMIC) { @@ -693,58 +703,82 @@ void create_reverse_map( const reverse_lno_t tmp_reverse_size = (num_reverse_elements + 1) << multiply_shift_for_scale; - reverse_array_type tmp_color_xadj("TMP_REVERSE_XADJ", tmp_reverse_size + 1); + reverse_array_type tmp_color_xadj( + Kokkos::view_alloc(my_exec_space, "TMP_REVERSE_XADJ"), + tmp_reverse_size + 1); Reverse_Map_Scale_Init rmi( forward_map, tmp_color_xadj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapScaleInit", - my_exec_space(0, num_forward_elements), rmi); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + rmi); + my_exec_space.fence(); inclusive_parallel_prefix_sum( - tmp_reverse_size + 1, tmp_color_xadj); - MyExecSpace().fence(); + my_exec_space, tmp_reverse_size + 1, tmp_color_xadj); + my_exec_space.fence(); - Kokkos::parallel_for("KokkosKernels::Common::StridedCopy", - my_exec_space(0, num_reverse_elements + 1), - StridedCopy( - tmp_color_xadj, reverse_map_xadj, scale_size)); - MyExecSpace().fence(); + Kokkos::parallel_for( + "KokkosKernels::Common::StridedCopy", + range_policy_t(my_exec_space, 0, num_reverse_elements + 1), + StridedCopy( + tmp_color_xadj, 
reverse_map_xadj, scale_size)); + my_exec_space.fence(); Fill_Reverse_Scale_Map frm( forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - my_exec_space(0, num_forward_elements), frm); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + frm); + my_exec_space.fence(); } else // atomic implementation. { reverse_array_type tmp_color_xadj( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "TMP_REVERSE_XADJ"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "TMP_REVERSE_XADJ"), num_reverse_elements + 1); Reverse_Map_Init rmi( forward_map, reverse_map_xadj); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapInit", - my_exec_space(0, num_forward_elements), rmi); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + rmi); + my_exec_space.fence(); // print_1Dview(reverse_map_xadj); inclusive_parallel_prefix_sum( - num_reverse_elements + 1, reverse_map_xadj); - MyExecSpace().fence(); - Kokkos::deep_copy(tmp_color_xadj, reverse_map_xadj); - MyExecSpace().fence(); + my_exec_space, num_reverse_elements + 1, reverse_map_xadj); + Kokkos::deep_copy(my_exec_space, tmp_color_xadj, reverse_map_xadj); + my_exec_space.fence(); Fill_Reverse_Map frm( forward_map, tmp_color_xadj, reverse_map_adj); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - my_exec_space(0, num_forward_elements), frm); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + frm); + my_exec_space.fence(); } } +template +void create_reverse_map( + const typename reverse_array_type::value_type + &num_forward_elements, // num_vertices + const typename forward_array_type::value_type + &num_reverse_elements, // num_colors + + const forward_array_type &forward_map, // vertex to colors + reverse_array_type &reverse_map_xadj, // colors to vertex xadj + reverse_array_type &reverse_map_adj) 
{ + MyExecSpace my_exec_space; + return create_reverse_map(my_exec_space, num_forward_elements, + num_reverse_elements, forward_map, reverse_map_xadj, + reverse_map_adj); +} + template struct PermuteVector { @@ -772,18 +806,30 @@ struct PermuteVector { template -void permute_vector(typename idx_array_type::value_type num_elements, +void permute_vector(MyExecSpace my_exec_space, + typename idx_array_type::value_type num_elements, idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - typedef Kokkos::RangePolicy my_exec_space; + using range_policy_t = Kokkos::RangePolicy; Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", my_exec_space(0, num_elements), + "KokkosKernels::Common::PermuteVector", + range_policy_t(my_exec_space, 0, num_elements), PermuteVector( old_vector, new_vector, old_to_new_index_map)); } +template +void permute_vector(typename idx_array_type::value_type num_elements, + idx_array_type &old_to_new_index_map, + value_array_type &old_vector, + out_value_array_type &new_vector) { + permute_vector(MyExecSpace(), num_elements, old_to_new_index_map, old_vector, + new_vector); +} + template struct PermuteBlockVector { @@ -817,27 +863,51 @@ struct PermuteBlockVector { template -void permute_block_vector(typename idx_array_type::value_type num_elements, +void permute_block_vector(MyExecSpace my_exec_space, + typename idx_array_type::value_type num_elements, int block_size, idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - typedef Kokkos::RangePolicy my_exec_space; - + using range_policy_t = Kokkos::RangePolicy; Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", my_exec_space(0, num_elements), + "KokkosKernels::Common::PermuteVector", + range_policy_t(my_exec_space, 0, num_elements), PermuteBlockVector(block_size, old_vector, new_vector, old_to_new_index_map)); } +template +void permute_block_vector(typename 
idx_array_type::value_type num_elements, + int block_size, idx_array_type &old_to_new_index_map, + value_array_type &old_vector, + out_value_array_type &new_vector) { + permute_block_vector(MyExecSpace(), num_elements, block_size, + old_to_new_index_map, old_vector, new_vector); +} + // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. +template +void zero_vector(ExecSpaceIn &exec_space_in, + typename value_array_type::value_type /* num_elements */, + value_array_type &vector) { + typedef typename value_array_type::non_const_value_type val_type; + Kokkos::deep_copy(exec_space_in, vector, + Kokkos::ArithTraits::zero()); + exec_space_in.fence(); +} + template void zero_vector(typename value_array_type::value_type /* num_elements */, value_array_type &vector) { - typedef typename value_array_type::non_const_value_type val_type; - Kokkos::deep_copy(vector, Kokkos::ArithTraits::zero()); + using ne_tmp_t = typename value_array_type::value_type; + ne_tmp_t ne_tmp = ne_tmp_t(0); + MyExecSpace my_exec_space; + zero_vector(my_exec_space, ne_tmp, + vector); } template @@ -1250,17 +1320,30 @@ struct ReduceRowSizeFunctor { // view has num_rows+1 elements. template -void kk_view_reduce_max_row_size(const size_t num_rows, +void kk_view_reduce_max_row_size(MyExecSpace my_exec_space, + const size_t num_rows, const size_type *rowmap_view_begins, const size_type *rowmap_view_ends, size_type &max_row_size) { - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceMaxRowSize", my_exec_space(0, num_rows), + "KokkosKernels::Common::ViewReduceMaxRowSize", + range_policy_t(my_exec_space, 0, num_rows), ReduceRowSizeFunctor(rowmap_view_begins, rowmap_view_ends), max_row_size); } +// view has num_rows+1 elements. 
+template +void kk_view_reduce_max_row_size(const size_t num_rows, + const size_type *rowmap_view_begins, + const size_type *rowmap_view_ends, + size_type &max_row_size) { + return kk_view_reduce_max_row_size(MyExecSpace(), num_rows, + rowmap_view_begins, rowmap_view_ends, + max_row_size); +} + template struct ReduceMaxRowFunctor { view_type rowmap_view; diff --git a/common/src/KokkosKernels_default_types.hpp b/common/src/KokkosKernels_default_types.hpp index 672bdf3fbb..1da965a082 100644 --- a/common/src/KokkosKernels_default_types.hpp +++ b/common/src/KokkosKernels_default_types.hpp @@ -25,7 +25,8 @@ using default_lno_t = int; #elif defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) using default_lno_t = int64_t; #else -using default_lno_t = int; +// Non-ETI build: default to int +using default_lno_t = int; #endif // Prefer int as the default offset type, because cuSPARSE doesn't support // size_t for rowptrs. @@ -34,6 +35,7 @@ using default_size_type = int; #elif defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) using default_size_type = size_t; #else +// Non-ETI build: default to int using default_size_type = int; #endif @@ -60,7 +62,7 @@ using default_scalar = double; #if defined(KOKKOS_ENABLE_CUDA) using default_device = Kokkos::Cuda; #elif defined(KOKKOS_ENABLE_HIP) -using default_device = Kokkos::Experimental::HIP; +using default_device = Kokkos::HIP; #elif defined(KOKKOS_ENABLE_OPENMPTARGET) using default_device = Kokkos::Experimental::OpenMPTarget; #elif defined(KOKKOS_ENABLE_OPENMP) diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 31744f7a8f..75c0951e10 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -25,7 +25,9 @@ #include #include #include +#if KOKKOS_VERSION < 40199 #include +#endif #include @@ -197,8 +199,6 @@ KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, namespace Kokkos { // Macro to automate the wrapping of Kokkos Mathematical Functions -// in the ArithTraits 
struct for real floating point types, hopefully -// this can be expanded to Kokkos::half_t and Kokkos::bhalf_t #define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ static FUNC_QUAL val_type zero() { return static_cast(0); } \ static FUNC_QUAL val_type one() { return static_cast(1); } \ @@ -279,6 +279,83 @@ namespace Kokkos { static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } +// Macro to automate the wrapping of Kokkos Mathematical Functions +#define KOKKOSKERNELS_ARITHTRAITS_HALF_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { return static_cast(0); } \ + static FUNC_QUAL val_type one() { return static_cast(1); } \ + static FUNC_QUAL val_type min() { \ + return Kokkos::Experimental::finite_min::value; \ + } \ + static FUNC_QUAL val_type max() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + static FUNC_QUAL val_type infinity() { \ + return Kokkos::Experimental::infinity::value; \ + } \ + static FUNC_QUAL val_type nan() { \ + return Kokkos::Experimental::quiet_NaN::value; \ + } \ + static FUNC_QUAL mag_type epsilon() { \ + return Kokkos::Experimental::epsilon::value; \ + } \ + static FUNC_QUAL mag_type sfmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int base() { \ + return Kokkos::Experimental::radix::value; \ + } \ + static FUNC_QUAL mag_type prec() { \ + return epsilon() * static_cast(base()); \ + } \ + static FUNC_QUAL int t() { \ + return Kokkos::Experimental::digits::value; \ + } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { \ + return Kokkos::Experimental::min_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int emax() { \ + return Kokkos::Experimental::max_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmax() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + \ + static 
FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ + static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ + static FUNC_QUAL val_type conj(const val_type x) { return x; } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { \ + return Kokkos::log10(x); \ + } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ + return abs(x); \ + } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ + static FUNC_QUAL mag_type eps() { return epsilon(); } + #define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ \ static 
constexpr bool is_specialized = true; \ @@ -912,8 +989,6 @@ class ArithTraits { //@} }; -// Since Kokkos::Experimental::half_t falls back to float, only define -// ArithTraits if half_t is a backend specialization #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT template <> class ArithTraits { @@ -926,8 +1001,9 @@ class ArithTraits { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; - static constexpr bool has_infinity = true; +#if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_half( Kokkos::Experimental::infinity::value); @@ -1028,16 +1104,21 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type epsilon() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); } +#endif + // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - // C++ doesn't have a standard "half-float" type. 
- using halfPrecision = val_type; - using doublePrecision = double; + using magnitudeType = mag_type; + using halfPrecision = Kokkos::Experimental::half_t; + using doublePrecision = float; + + static std::string name() { return "half_t"; } static constexpr bool isComplex = false; static constexpr bool isOrdinal = false; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = true; + +#if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } @@ -1047,7 +1128,6 @@ class ArithTraits { static KOKKOS_FUNCTION val_type conjugate(const val_type x) { return conj(x); } - static std::string name() { return "half"; } static KOKKOS_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } @@ -1077,8 +1157,15 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } +#else +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ARITHTRAITS_HALF_FP(KOKKOS_FUNCTION) +#else + KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) +#endif +#endif }; -#endif // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF +#endif // #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT // Since Kokkos::Experimental::bhalf_t falls back to float, only define // ArithTraits if bhalf_t is a backend specialization @@ -1094,8 +1181,9 @@ class ArithTraits { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; - static constexpr bool has_infinity = true; +#if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_bhalf( Kokkos::Experimental::infinity::value); @@ -1193,16 +1281,23 @@ class ArithTraits { // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS); return 
Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON); } +#endif + // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - // C++ doesn't have a standard "bhalf-float" type. - using bhalfPrecision = val_type; - using doublePrecision = double; + using magnitudeType = mag_type; + using bhalfPrecision = Kokkos::Experimental::bhalf_t; + // There is no type that has twice the precision as bhalf_t. + // The closest type would be float. + using doublePrecision = void; static constexpr bool isComplex = false; static constexpr bool isOrdinal = false; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = true; + + static std::string name() { return "bhalf_t"; } + +#if KOKKOS_VERSION < 40199 static KOKKOS_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } @@ -1212,7 +1307,6 @@ class ArithTraits { static KOKKOS_FUNCTION val_type conjugate(const val_type x) { return conj(x); } - static std::string name() { return "bhalf"; } static KOKKOS_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } @@ -1242,8 +1336,15 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); } +#else +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ARITHTRAITS_HALF_FP(KOKKOS_FUNCTION) +#else + KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) +#endif +#endif }; -#endif // KOKKOS_BHALF_T_IS_FLOAT +#endif // #if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT template <> class ArithTraits { diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 8aa963b2ab..1d9a4c6480 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -35,18 +35,31 @@ #include // typeid (T) #include +#if KOKKOS_VERSION < 40199 #define FAILURE() \ { \ 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%s:%d: Failure\n", __FILE__, __func__, \ __LINE__); \ success = 0; \ } +#else +#define FAILURE() \ + { \ + Kokkos::printf("%s:%s:%d: Failure\n", __FILE__, __func__, __LINE__); \ + success = 0; \ + } +#endif #if 0 +#if KOKKOS_VERSION < 40199 #define TRACE() \ KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%s:%d: Trace\n", __FILE__, __func__, \ __LINE__); #else +#define TRACE() \ + Kokkos::printf("%s:%s:%d: Trace\n", __FILE__, __func__, __LINE__); +#endif +#else #define TRACE() #endif @@ -119,7 +132,7 @@ struct HasTranscendentals { template class ArithTraitsTesterBase { public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -181,7 +194,11 @@ class ArithTraitsTesterBase { // T, but we check for this int constant for compatibility with // std::numeric_limits. if (!AT::is_specialized) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("! AT::is_specialized\n"); +#else + Kokkos::printf("! AT::is_specialized\n"); +#endif FAILURE(); } @@ -189,13 +206,21 @@ class ArithTraitsTesterBase { // function, just not to its class methods (which are not marked // as device functions). if (AT::is_integer != std::numeric_limits::is_integer) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::is_integer not same as numeric_limits\n"); +#else + Kokkos::printf("AT::is_integer not same as numeric_limits\n"); +#endif FAILURE(); } if (AT::is_exact != std::numeric_limits::is_exact) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::is_exact not same as numeric_limits\n"); +#else + Kokkos::printf("AT::is_exact not same as numeric_limits\n"); +#endif FAILURE(); } @@ -204,34 +229,62 @@ class ArithTraitsTesterBase { // Test properties of the arithmetic and multiplicative identities. 
if (zero + zero != zero) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 + 0 != 0\n"); +#else + Kokkos::printf("0 + 0 != 0\n"); +#endif FAILURE(); } if (zero + one != one) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 + 1 != 1\n"); +#else + Kokkos::printf("0 + 1 != 1\n"); +#endif FAILURE(); } if (one - one != zero) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 - 1 != 0\n"); +#else + Kokkos::printf("1 - 1 != 0\n"); +#endif FAILURE(); } // This is technically 1 even of Z_2, since in that field, one // is its own inverse (so -one == one). if ((one + one) - one != one) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("(1 + 1) - 1 != 1\n"); +#else + Kokkos::printf("(1 + 1) - 1 != 1\n"); +#endif FAILURE(); } if (AT::abs(zero) != zero) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(0) != 0\n"); +#else + Kokkos::printf("AT::abs(0) != 0\n"); +#endif FAILURE(); } if (AT::abs(one) != one) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(1) != 1\n"); +#else + Kokkos::printf("AT::abs(1) != 1\n"); +#endif FAILURE(); } if (AT::is_signed && AT::abs(-one) != one) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::is_signed and AT::abs(-1) != 1\n"); +#else + Kokkos::printf("AT::is_signed and AT::abs(-1) != 1\n"); +#endif FAILURE(); } // Need enable_if to test whether T can be compared using <=. @@ -240,7 +293,11 @@ class ArithTraitsTesterBase { // These are very mild ordering properties. // They should work even for a set only containing zero. 
if (AT::abs(zero) > AT::abs(AT::max())) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::abs(0) > AT::abs (AT::max ())\n"); +#else + Kokkos::printf("AT::abs(0) > AT::abs (AT::max ())\n"); +#endif FAILURE(); } @@ -356,9 +413,20 @@ class ArithTraitsTesterBase { } if (AT::has_infinity) { - if (!AT::isInf(AT::infinity())) { - out << "AT::isInf (inf) != true" << endl; - FAILURE(); +// Compiler intrinsic casts from inf of type half_t / bhalf_t to inf +// of type float in CUDA, SYCL and HIP do not work yet. +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_ENABLE_HIP) + namespace KE = Kokkos::Experimental; + if constexpr (!std::is_same::value && + !std::is_same::value) { +#else + { +#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP + if (!AT::isInf(AT::infinity())) { + out << "AT::isInf (inf) != true" << endl; + FAILURE(); + } } } if (!std::is_same::value) { @@ -430,7 +498,7 @@ class ArithTraitsTesterTranscendentalBase typedef ArithTraitsTesterBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -509,7 +577,7 @@ class ArithTraitsTesterTranscendentalBase } public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. 
typedef int value_type; @@ -553,20 +621,36 @@ class ArithTraitsTesterTranscendentalBase if (!AT::is_complex) { result = AT::pow(two, three); if (!equal(result, eight)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(2,3) != 8\n"); +#else + Kokkos::printf("AT::pow(2,3) != 8\n"); +#endif FAILURE(); } } if (!equal(AT::pow(three, zero), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,0) != 1\n"); +#else + Kokkos::printf("AT::pow(3,0) != 1\n"); +#endif FAILURE(); } if (!equal(AT::pow(three, one), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,1) != 3\n"); +#else + Kokkos::printf("AT::pow(3,1) != 3\n"); +#endif FAILURE(); } if (!equal(AT::pow(three, two), nine)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,2) != 9\n"); +#else + Kokkos::printf("AT::pow(3,2) != 9\n"); +#endif FAILURE(); } @@ -574,7 +658,11 @@ class ArithTraitsTesterTranscendentalBase if (!AT::is_complex) { result = AT::pow(three, three); if (!equal(result, twentySeven)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(3,3) != 27\n"); +#else + Kokkos::printf("AT::pow(3,3) != 27\n"); +#endif FAILURE(); } } @@ -583,93 +671,170 @@ class ArithTraitsTesterTranscendentalBase if (AT::is_signed && !AT::is_complex) { result = AT::pow(-three, one); if (!equal(result, -three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,1) != -3\n"); +#else + Kokkos::printf("AT::pow(-3,1) != -3\n"); +#endif FAILURE(); } result = AT::pow(-three, two); if (!equal(result, nine)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,2) != 9\n"); +#else + Kokkos::printf("AT::pow(-3,2) != 9\n"); +#endif FAILURE(); } result = AT::pow(-three, three); if (!equal(result, -twentySeven)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::pow(-3,3) != 27\n"); +#else + Kokkos::printf("AT::pow(-3,3) != 27\n"); +#endif FAILURE(); } } if (!equal(AT::sqrt(zero), 
zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(0) != 0\n"); +#else + Kokkos::printf("AT::sqrt(0) != 0\n"); +#endif FAILURE(); } if (!equal(AT::sqrt(one), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(1) != 1\n"); +#else + Kokkos::printf("AT::sqrt(1) != 1\n"); +#endif FAILURE(); } if (!equal(AT::sqrt(thirtySix), six)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(36) != 6\n"); +#else + Kokkos::printf("AT::sqrt(36) != 6\n"); +#endif FAILURE(); } if (!equal(AT::sqrt(sixtyFour), eight)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(64) != 8\n"); +#else + Kokkos::printf("AT::sqrt(64) != 8\n"); +#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::sqrt(fortyTwo), six)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:sqrt(42) != 6\n"); +#else + Kokkos::printf("AT:sqrt(42) != 6\n"); +#endif FAILURE(); } if (!equal(AT::sqrt(oneTwentySeven), eleven)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::sqrt(127) != 11\n"); +#else + Kokkos::printf("AT::sqrt(127) != 11\n"); +#endif FAILURE(); } } if (!equal(AT::cbrt(zero), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 0\n"); +#else + Kokkos::printf("AT::cbrt(0) != 0\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(one), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(1) != 1\n"); +#else + Kokkos::printf("AT::cbrt(1) != 1\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(twentySeven), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(27) != 3\n"); +#else + Kokkos::printf("AT::cbrt(27) != 3\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(sixtyFour), four)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(64) != 4\n"); +#else + Kokkos::printf("AT::cbrt(64) != 4\n"); +#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt(fortyTwo), three)) { +#if KOKKOS_VERSION < 40199 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:cbrt(42) != 3\n"); +#else + Kokkos::printf("AT:cbrt(42) != 3\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(oneTwentySeven), five)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(127) != 5\n"); +#else + Kokkos::printf("AT::cbrt(127) != 5\n"); +#endif FAILURE(); } } if (!equal(AT::exp(zero), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 1\n"); +#else + Kokkos::printf("AT::cbrt(0) != 1\n"); +#endif FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); +#else + Kokkos::printf( + "AT::conj(exp(complex(2,2))) != AT::exp(conj(complex(2,2)))\n"); +#endif FAILURE(); } } if (!equal(AT::log(one), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::log(1) != 0\n"); +#else + Kokkos::printf("AT::log(1) != 0\n"); +#endif FAILURE(); } if (!equal(AT::log10(one), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::log10(1) != 0\n"); +#else + Kokkos::printf("AT::log10(1) != 0\n"); +#endif FAILURE(); } @@ -678,13 +843,23 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); +#else + Kokkos::printf( + "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); +#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#else + Kokkos::printf( + "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#endif FAILURE(); } } else { @@ -692,27 +867,47 @@ class 
ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); +#else + Kokkos::printf("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); +#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#else + Kokkos::printf("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#endif FAILURE(); } } if (!equal(AT::asin(AT::sin(one)), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::asin(sin(1)) != 1\n"); +#else + Kokkos::printf("AT::asin(sin(1)) != 1\n"); +#endif FAILURE(); } if (!equal(AT::acos(AT::cos(one)), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::acos(cos(1)) != 1\n"); +#else + Kokkos::printf("AT::acos(cos(1)) != 1\n"); +#endif FAILURE(); } if (!equal(AT::atan(AT::tan(one)), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::atan(tan(1)) != 1\n"); +#else + Kokkos::printf("AT::atan(tan(1)) != 1\n"); +#endif FAILURE(); } @@ -839,41 +1034,74 @@ class ArithTraitsTesterTranscendentalBase } if (!equal(AT::cbrt(zero), zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 0\n"); +#else + Kokkos::printf("AT::cbrt(0) != 0\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(one), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(1) != 1\n"); +#else + Kokkos::printf("AT::cbrt(1) != 1\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(twentySeven), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(27) != 3\n"); +#else + Kokkos::printf("AT::cbrt(27) != 3\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(sixtyFour), four)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(64) != 
4\n"); +#else + Kokkos::printf("AT::cbrt(64) != 4\n"); +#endif FAILURE(); } if (AT::is_integer) { if (!equal(AT::cbrt(fortyTwo), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT:cbrt(42) != 3\n"); +#else + Kokkos::printf("AT:cbrt(42) != 3\n"); +#endif FAILURE(); } if (!equal(AT::cbrt(oneTwentySeven), five)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(127) != 5\n"); +#else + Kokkos::printf("AT::cbrt(127) != 5\n"); +#endif FAILURE(); } } if (!equal(AT::exp(zero), one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::cbrt(0) != 1\n"); +#else + Kokkos::printf("AT::cbrt(0) != 1\n"); +#endif FAILURE(); } if (AT::is_complex) { const ScalarType val = two; //(two.real(), two.real()); if (!equal(AT::conj(AT::exp(val)), AT::exp(AT::conj(val)))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); +#else + Kokkos::printf( + "AT::conj(exp(complex(2,0))) != AT::exp(conj(complex(2,0)))\n"); +#endif FAILURE(); } } @@ -891,13 +1119,23 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); +#else + Kokkos::printf( + "AT(complex):: sin(val)*sin(val) + cos(val)*cos(val) != 1\n"); +#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#else + Kokkos::printf( + "AT(complex):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#endif FAILURE(); } } else { @@ -905,27 +1143,47 @@ class ArithTraitsTesterTranscendentalBase const auto val_sin = AT::sin(val); const auto val_cos = AT::cos(val); if (!equal(val_sin * val_sin + val_cos * val_cos, one)) { +#if KOKKOS_VERSION < 40199 
KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); +#else + Kokkos::printf("AT(real):: sin(val)*sin(val) + cos(a)*cos(a) != 1\n"); +#endif FAILURE(); } if (!equal(val_sin / val_cos, AT::tan(val))) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#else + Kokkos::printf("AT(real):: sin(val)/cos(val) != AT(real)::tan(val)\n"); +#endif FAILURE(); } } if (!equal(AT::asin(AT::sin(three)), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::asin(sin(3)) != 3\n"); +#else + Kokkos::printf("AT::asin(sin(3)) != 3\n"); +#endif FAILURE(); } if (!equal(AT::acos(AT::cos(three)), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::acos(cos(3)) != 3\n"); +#else + Kokkos::printf("AT::acos(cos(3)) != 3\n"); +#endif FAILURE(); } if (!equal(AT::atan(AT::tan(three)), three)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::atan(tan(3)) != 3\n"); +#else + Kokkos::printf("AT::atan(tan(3)) != 3\n"); +#endif FAILURE(); } @@ -993,7 +1251,7 @@ class ArithTraitsTesterComplexBase typedef ArithTraitsTesterTranscendentalBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. 
typedef int value_type; @@ -1017,10 +1275,17 @@ class ArithTraitsTesterComplexBase #else { if (AT::is_signed != std::numeric_limits::is_signed) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "AT::is_signed = 0x%x, std::numeric_limits::is_signed " "= 0x%x\n", AT::is_signed, std::numeric_limits::is_signed); +#else + Kokkos::printf( + "AT::is_signed = 0x%x, std::numeric_limits::is_signed " + "= 0x%x\n", + AT::is_signed, std::numeric_limits::is_signed); +#endif FAILURE(); } } @@ -1079,7 +1344,7 @@ class ArithTraitsTesterComplexBase typedef ArithTraitsTesterTranscendentalBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -1217,7 +1482,7 @@ class ArithTraitsTesterFloatingPointBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -1233,34 +1498,91 @@ class ArithTraitsTesterFloatingPointBase int success = 1; if (AT::is_exact) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("AT::is_exact is 1\n"); +#else + Kokkos::printf("AT::is_exact is 1\n"); +#endif FAILURE(); } - if (!AT::isNan(AT::nan())) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("NaN is not NaN\n"); - FAILURE(); +// Compiler intrinsic casts from nan of type half_t / bhalf_t to nan +// of type float in CUDA, SYCL and HIP do not work yet. 
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_ENABLE_HIP) + namespace KE = Kokkos::Experimental; + if constexpr (!std::is_same::value && + !std::is_same::value) { +#else + { +#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_SYCL || KOKKOS_ENABLE_HIP + if (!AT::isNan(AT::nan())) { +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("NaN is not NaN\n"); +#else + Kokkos::printf("NaN is not NaN\n"); +#endif + FAILURE(); + } } const ScalarType zero = AT::zero(); const ScalarType one = AT::one(); if (AT::isInf(zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is Inf\n"); +#else + Kokkos::printf("0 is Inf\n"); +#endif FAILURE(); } if (AT::isInf(one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is Inf\n"); +#else + Kokkos::printf("1 is Inf\n"); +#endif FAILURE(); } +#if defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_ENABLE_HIP) // FIXME_SYCL, FIXME_HIP + if constexpr (!std::is_same_v) { + if (AT::isNan(zero)) { +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); +#else + Kokkos::printf("0 is NaN\n"); +#endif + FAILURE(); + } + if (AT::isNan(one)) { +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is NaN\n"); +#else + Kokkos::printf("1 is NaN\n"); +#endif + FAILURE(); + } + } +#else if (AT::isNan(zero)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("0 is NaN\n"); +#else + Kokkos::printf("0 is NaN\n"); +#endif FAILURE(); } if (AT::isNan(one)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("1 is NaN\n"); +#else + Kokkos::printf("1 is NaN\n"); +#endif FAILURE(); } +#endif // Call the base class' implementation. 
Every subclass' // implementation of operator() must do this, in order to include @@ -1285,10 +1607,19 @@ class ArithTraitsTesterFloatingPointBase // if (std::numeric_limits::is_iec559) { // success = success && AT::isInf (AT::inf ()); +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + if constexpr (!std::is_same_v) { + if (!AT::isNan(AT::nan())) { + out << "isNan or nan failed" << endl; + FAILURE(); + } + } +#else if (!AT::isNan(AT::nan())) { out << "isNan or nan failed" << endl; FAILURE(); } +#endif //} const ScalarType zero = AT::zero(); @@ -1302,6 +1633,18 @@ class ArithTraitsTesterFloatingPointBase out << "isInf(one) is 1" << endl; FAILURE(); } +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_HIP) + if constexpr (!std::is_same_v) { + if (AT::isNan(zero)) { + out << "isNan(zero) is 1" << endl; + FAILURE(); + } + if (AT::isNan(one)) { + out << "isNan(one) is 1" << endl; + FAILURE(); + } + } +#else if (AT::isNan(zero)) { out << "isNan(zero) is 1" << endl; FAILURE(); @@ -1310,6 +1653,7 @@ class ArithTraitsTesterFloatingPointBase out << "isNan(one) is 1" << endl; FAILURE(); } +#endif // Call the base class' implementation. Every subclass' // implementation of testHostImpl() should (must) do this, in @@ -1336,7 +1680,7 @@ class ArithTraitsTesterFloatingPointBase base_type; public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -1352,7 +1696,11 @@ class ArithTraitsTesterFloatingPointBase int success = 1; if (!AT::is_exact) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("! AT:is_exact\n"); +#else + Kokkos::printf("! 
AT:is_exact\n"); +#endif FAILURE(); } @@ -1417,7 +1765,7 @@ template class ArithTraitsTester : public ArithTraitsTesterFloatingPointBase { public: - typedef DeviceType execution_space; + typedef typename DeviceType::execution_space execution_space; typedef typename execution_space::size_type size_type; //! Type of the result of the reduction. typedef int value_type; @@ -1703,6 +2051,6 @@ void test_ArithTraits() { success = runAllArithTraitsHostTests(out, 0); EXPECT_TRUE(success); } -TEST_F(TestCategory, common_ArithTraits) { test_ArithTraits(); } +TEST_F(TestCategory, common_ArithTraits) { test_ArithTraits(); } #endif // KOKKOS_ARITHTRAITSTEST_HPP diff --git a/common/unit_test/Test_Common_IOUtils.hpp b/common/unit_test/Test_Common_IOUtils.hpp index c4e031adf4..1219304421 100644 --- a/common/unit_test/Test_Common_IOUtils.hpp +++ b/common/unit_test/Test_Common_IOUtils.hpp @@ -70,6 +70,6 @@ void testPrintView() { "[2x2 multi-vector]\n"); } -TEST_F(TestCategory, common_print_view) { testPrintView(); } +TEST_F(TestCategory, common_print_view) { testPrintView(); } #endif // KOKKOSKERNELS_IOTEST_HPP diff --git a/common/unit_test/Test_Common_Iota.hpp b/common/unit_test/Test_Common_Iota.hpp index cae207d56b..af3b6502bf 100644 --- a/common/unit_test/Test_Common_Iota.hpp +++ b/common/unit_test/Test_Common_Iota.hpp @@ -74,6 +74,17 @@ void test_iota_rank() { EXPECT_EQ((Iota::rank), 1); } +template +void test_iota_non_const_value_type() { + static_assert( + std::is_same_v::non_const_value_type, T>, + "Iota's non-const value type should be same as non-const type provided"); + static_assert( + std::is_same_v::non_const_value_type, T>, + "Iota's non-const value type should be same as non-const version of " + "const type provided"); +} + template void test_iota_subview() { // get the 7th and 8th elements of an Iota @@ -98,6 +109,7 @@ void test_iota() { test_is_iota(); test_iota_constructor(); test_iota_rank(); + test_iota_non_const_value_type(); test_iota_subview(); } diff --git 
a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index 3ff27da23c..6ca28b8be1 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -43,9 +43,14 @@ struct ThreadLowerBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::lower_bound_thread(haystack_, needle_); if (idx != expected_) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); +#else + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, + __LINE__, int(i), int(expected_), int(idx)); +#endif ++lerrCount; } } @@ -100,9 +105,14 @@ struct TeamLowerBoundFunctor { hv_size_type idx = KokkosKernels::lower_bound_team(handle, haystack_, needle_); if (idx != expected_) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), int(expected_), int(idx)); +#else + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, + int(handle.team_rank()), int(expected_), int(idx)); +#endif ++lerrCount; } } @@ -228,31 +238,31 @@ void test_lower_bound() { #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(int, TestExecSpace) +EXECUTE_TEST(int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(int64_t, TestExecSpace) +EXECUTE_TEST(int64_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(size_t, TestExecSpace) +EXECUTE_TEST(size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
-EXECUTE_TEST(float, TestExecSpace) +EXECUTE_TEST(float, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, TestExecSpace) +EXECUTE_TEST(double, TestDevice) #endif #undef EXECUTE_TEST diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp index 07a55e152b..6638c6e398 100644 --- a/common/unit_test/Test_Common_PrintConfiguration.hpp +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -57,7 +57,7 @@ void testPrintConfiguration() { } TEST_F(TestCategory, common_print_configuration) { - testPrintConfiguration(); + testPrintConfiguration(); } #endif // KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP diff --git a/common/unit_test/Test_Common_Sorting.hpp b/common/unit_test/Test_Common_Sorting.hpp index 51ecf228a8..e93a9d0939 100644 --- a/common/unit_test/Test_Common_Sorting.hpp +++ b/common/unit_test/Test_Common_Sorting.hpp @@ -47,7 +47,7 @@ size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, } Kokkos::deep_copy(randomCounts, countsHost); Kokkos::deep_copy(randomOffsets, randomCounts); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( n, randomOffsets); return total; } @@ -178,17 +178,18 @@ struct TestSerialRadix2Functor { OrdView offsets; }; -template +template void testSerialRadixSort(size_t k, size_t subArraySize) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = 
generateRandomOffsets(counts, offsets, k, + subArraySize); KeyView keys("Radix sort testing data", n); fillRandom(keys); // Sort using std::sort on host to do correctness test @@ -196,11 +197,11 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { Kokkos::deep_copy(gold, keys); KeyView keysAux("Radix sort aux data", n); // Run the sorting on device in all sub-arrays in parallel - typedef Kokkos::RangePolicy range_policy; + typedef Kokkos::RangePolicy range_policy; Kokkos::parallel_for( range_policy(0, k), TestSerialRadixFunctor(keys, keysAux, counts, offsets)); - ExecSpace().fence(); + exec_space().fence(); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); auto offsetsHost = @@ -218,18 +219,19 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { } } -template +template void testSerialRadixSort2(size_t k, size_t subArraySize) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; typedef Kokkos::View ValView; OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, + subArraySize); KeyView keys("Radix test keys", n); ValView data("Radix test data", n); // The keys are randomized @@ -239,12 +241,12 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { KeyView keysAux("Radix sort aux keys", n); ValView dataAux("Radix sort aux data", n); // Run the sorting on device in all sub-arrays in parallel - typedef Kokkos::RangePolicy range_policy; + typedef Kokkos::RangePolicy range_policy; // Deliberately using a weird number for vector length Kokkos::parallel_for(range_policy(0, k), TestSerialRadix2Functor( keys, keysAux, 
data, dataAux, counts, offsets)); - ExecSpace().fence(); + exec_space().fence(); // Sort using std::sort on host to do correctness test auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); @@ -312,30 +314,31 @@ struct TestTeamBitonic2Functor { OrdView offsets; }; -template +template void testTeamBitonicSort(size_t k, size_t subArraySize) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View ValView; OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, + subArraySize); ValView data("Bitonic sort testing data", n); fillRandom(data); Kokkos::View gold("Host sorted", n); Kokkos::deep_copy(gold, data); // Run the sorting on device in all sub-arrays in parallel Kokkos::parallel_for( - Kokkos::TeamPolicy(k, Kokkos::AUTO()), + Kokkos::TeamPolicy(k, Kokkos::AUTO()), TestTeamBitonicFunctor(data, counts, offsets)); // Copy result to host auto dataHost = Kokkos::create_mirror_view(data); Kokkos::deep_copy(dataHost, data); // Sort using std::sort on host to do correctness test - ExecSpace().fence(); + exec_space().fence(); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); auto offsetsHost = @@ -350,18 +353,19 @@ void testTeamBitonicSort(size_t k, size_t subArraySize) { } } -template +template void testTeamBitonicSort2(size_t k, size_t subArraySize) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; typedef 
Kokkos::View ValView; OrdView counts("Subarray Sizes", k); OrdView offsets("Subarray Offsets", k); // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, - subArraySize); + size_t n = generateRandomOffsets(counts, offsets, k, + subArraySize); KeyView keys("Bitonic test keys", n); ValView data("Bitonic test data", n); // The keys are randomized @@ -370,10 +374,10 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) { Kokkos::deep_copy(gold, keys); // Run the sorting on device in all sub-arrays in parallel, just using vector // loops Deliberately using a weird number for vector length - Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), TestTeamBitonic2Functor( keys, data, counts, offsets)); - ExecSpace().fence(); + exec_space().fence(); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); auto offsetsHost = @@ -409,16 +413,17 @@ struct CheckSortedFunctor { View v; }; -template +template void testBitonicSort(size_t n) { // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Kokkos::View ValView; ValView data("Bitonic sort testing data", n); fillRandom(data); - KokkosKernels::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckSortedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); @@ -444,19 +449,20 @@ struct CompareDescending { } }; -template +template void testBitonicSortDescending() { + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef char Scalar; typedef CompareDescending Comp; // Create a view of randomized data - typedef typename 
ExecSpace::memory_space mem_space; typedef Kokkos::View ValView; size_t n = 12521; ValView data("Bitonic sort testing data", n); fillRandom(data); - KokkosKernels::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); @@ -479,18 +485,19 @@ struct LexCompare { } }; -template +template void testBitonicSortLexicographic() { + typedef typename Device::execution_space exec_space; + typedef typename Device::memory_space mem_space; typedef Coordinates Scalar; // Create a view of randomized data - typedef typename ExecSpace::memory_space mem_space; typedef Kokkos::View ValView; size_t n = 9521; ValView data("Bitonic sort testing data", n); fillRandom(data); - KokkosKernels::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); ASSERT_TRUE(ordered); @@ -501,8 +508,8 @@ TEST_F(TestCategory, common_serial_radix) { // 1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 100; for (size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) { - testSerialRadixSort(numArrays, arrayMax); - testSerialRadixSort(numArrays, arrayMax); + testSerialRadixSort(numArrays, arrayMax); + testSerialRadixSort(numArrays, arrayMax); } } @@ -511,10 +518,10 @@ TEST_F(TestCategory, common_serial_radix2) { // 1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 100; for (size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) { - testSerialRadixSort2(numArrays, arrayMax); - testSerialRadixSort2(numArrays, arrayMax); - testSerialRadixSort2>(numArrays, - arrayMax); + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2(numArrays, arrayMax); + 
testSerialRadixSort2>(numArrays, + arrayMax); } } @@ -523,8 +530,8 @@ TEST_F(TestCategory, common_team_bitonic) { // 1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 20; for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort(numArrays, arrayMax); - testTeamBitonicSort(numArrays, arrayMax); + testTeamBitonicSort(numArrays, arrayMax); + testTeamBitonicSort(numArrays, arrayMax); } } @@ -533,27 +540,27 @@ TEST_F(TestCategory, common_team_bitonic2) { // 1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 20; for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2>(numArrays, - arrayMax); + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2>(numArrays, + arrayMax); } } TEST_F(TestCategory, common_device_bitonic) { // Test device-level bitonic with some larger arrays - testBitonicSort(243743); - testBitonicSort(2157); - testBitonicSort(424); - testBitonicSort(5); - testBitonicSort(92314); - testBitonicSort(123); - testBitonicSort(60234); - testBitonicSort(53); + testBitonicSort(243743); + testBitonicSort(2157); + testBitonicSort(424); + testBitonicSort(5); + testBitonicSort(92314); + testBitonicSort(123); + testBitonicSort(60234); + testBitonicSort(53); // Test custom comparator: ">" instead of "<" to sort descending - testBitonicSortDescending(); + testBitonicSortDescending(); // Test custom comparator: lexicographic comparison of 3-element struct - testBitonicSortLexicographic(); + testBitonicSortLexicographic(); } #endif diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index a6d3b24d84..113b76c3ad 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -43,9 +43,14 @@ struct ThreadUpperBoundFunctor 
{ if (0 == i) { hv_size_type idx = KokkosKernels::upper_bound_thread(haystack_, needle_); if (idx != expected_) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(i), int(expected_), int(idx)); +#else + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, + __LINE__, int(i), int(expected_), int(idx)); +#endif ++lerrCount; } } @@ -100,9 +105,14 @@ struct TeamUpperBoundFunctor { hv_size_type idx = KokkosKernels::upper_bound_team(handle, haystack_, needle_); if (idx != expected_) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, int(handle.team_rank()), int(expected_), int(idx)); +#else + Kokkos::printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, + int(handle.team_rank()), int(expected_), int(idx)); +#endif ++lerrCount; } } @@ -219,31 +229,31 @@ void test_upper_bound() { #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(int, TestExecSpace) +EXECUTE_TEST(int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(int64_t, TestExecSpace) +EXECUTE_TEST(int64_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(size_t, TestExecSpace) +EXECUTE_TEST(size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, TestExecSpace) +EXECUTE_TEST(float, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, TestExecSpace) +EXECUTE_TEST(double, TestDevice) 
#endif #undef EXECUTE_TEST diff --git a/common/unit_test/Test_Common_set_bit_count.hpp b/common/unit_test/Test_Common_set_bit_count.hpp index dd65ced821..6e2c6e80b6 100644 --- a/common/unit_test/Test_Common_set_bit_count.hpp +++ b/common/unit_test/Test_Common_set_bit_count.hpp @@ -218,35 +218,35 @@ void test_ffs() { } TEST_F(TestCategory, common_set_bit_count) { - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); - test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); + test_set_bit_count(); } TEST_F(TestCategory, common_ffs) { - test_ffs(); - test_ffs(); - test_ffs(); - test_ffs(); - - test_ffs(); - test_ffs(); - test_ffs(); - test_ffs(); - - test_ffs(); - test_ffs(); - test_ffs(); - test_ffs(); + test_ffs(); + test_ffs(); + test_ffs(); + test_ffs(); + + test_ffs(); + test_ffs(); + test_ffs(); + test_ffs(); + + test_ffs(); + test_ffs(); + test_ffs(); + test_ffs(); } diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 954b6b669b..2b23557a81 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -1186,7 +1186,7 @@ IGNORE_PREFIX = # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. -GENERATE_HTML = NO +GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -1643,7 +1643,7 @@ FORMULA_MACROFILE = # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. 
-USE_MATHJAX = NO +USE_MATHJAX = YES # With MATHJAX_VERSION it is possible to specify the MathJax version to be used. # Note that the different versions of MathJax have different requirements with diff --git a/docs/_static/table_theme.css b/docs/_static/table_theme.css new file mode 100644 index 0000000000..7271d0e2fd --- /dev/null +++ b/docs/_static/table_theme.css @@ -0,0 +1,5 @@ +.wy-nav-content { + height: 100%; + max-width: 100% !important; + margin: auto; +} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index f7027880c5..cfed3629aa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,4 +79,6 @@ def configureDoxyfile(input_dir, output_dir, doxyfile_in, doxyfile_out): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['_static'] +html_static_path = ['_static'] + +html_css_files = ['table_theme.css'] diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst index 20dbc5ea9a..434e9caf03 100644 --- a/docs/developer/apidocs/blas2.rst +++ b/docs/developer/apidocs/blas2.rst @@ -3,10 +3,15 @@ BLAS2 -- KokkosKernels blas2 interfaces gemv ---- +.. doxygenfunction:: KokkosBlas::gemv(const ExecutionSpace &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) .. doxygenfunction:: KokkosBlas::gemv(const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) -.. doxygenfunction:: KokkosBlas::gemv(const execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) ger ---- .. 
doxygenfunction:: KokkosBlas::ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) .. doxygenfunction:: KokkosBlas::ger(const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) + +syr +---- +.. doxygenfunction:: KokkosBlas::syr(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) +.. doxygenfunction:: KokkosBlas::syr(const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index f73b507439..415f72eec8 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -38,11 +38,11 @@ crs2coo spmv ---- -.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) -.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) +.. doxygenfunction:: KokkosSparse::spmv(const ExecutionSpace& space, KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. 
doxygenfunction:: KokkosSparse::spmv(const ExecutionSpace& space, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) .. doxygenfunction:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) - trsv ---- .. doxygenfunction:: KokkosSparse::trsv @@ -60,11 +60,20 @@ block_spgemm gauss_seidel ------------ +.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) +.. doxygenfunction:: gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. 
doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: symmetric_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. 
doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel diff --git a/docs/developer/index.rst b/docs/developer/index.rst index 58f89084ac..c95a3ee63b 100644 --- a/docs/developer/index.rst +++ b/docs/developer/index.rst @@ -9,4 +9,4 @@ Developer Manual Code Style Guide Contributing Profiling - \ No newline at end of file + Testing Table \ No newline at end of file diff --git a/docs/developer/testing_table.rst b/docs/developer/testing_table.rst new file mode 100644 index 0000000000..a2ec29adf8 --- /dev/null +++ b/docs/developer/testing_table.rst @@ -0,0 +1,527 @@ +Testing Table +============= + +SAND2023-05267O [#]_ + +Below is a testing table summarizing the KokkosKernels continuous integration and nightly test coverage. 
+ +The following is a description of abbreviations used throughout the testing table. + +* ETI: Explicit template instantiation +* PR: Pull Request +* LEFT: LayoutLeft +* RIGHT: LayoutRight +* REL: CMake release build type +* DBG: CMake debug build type +* BCHK: Kokkos core bounds checking +* NOETI: No default ETI types included +* UVM: Unified Memory (Cuda) + +The following is a description of column headings in the testing table. + +* Project: the jenkins project name for the test case +* Architectures: the test case's coverage architectures +* Compilers: the covered compilers +* Backends: the covered kokkos core backends +* Scalars: the covered ETI'd scalar types +* Ordinals: the covered ETI'd ordinal types +* Offsets: the covered ETI'd offset types +* Layouts: the covered ETI'd kokkos core layout types + +.. list-table:: + :align: center + :header-rows: 1 + :stub-columns: 0 + :width: 100% + :widths: auto + + + * - Project + - Architectures + - Compilers + - Backends + - Scalars + - Ordinals + - Offsets + - Layouts + + * * `PR_A64FX_ARMPL2110_OPENMP_LEFT_OPENBLAS_OPENLAPACK_REL` + * A64FX + * ARMPL 21.1.10 + * OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_A64FX_ARMPL2110_OPENMP_LEFT_OPENBLAS_OPENLAPACK_REL` + * A64FX + * ARMPL 21.1.10 + * OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_A64FX_GCC1020_OPENMP_SERIAL_LEFT_REL` + * A64FX + * GNU 10.2.0 + * OpenMP,Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_VEGA908_ROCM520_HIP_SERIAL_LEFT_REL` + * VEGA908 + * ROCM 5.2.0 + * Hip, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PRTONIGHTLY_VEGA908_ROCM520_HIP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL` + * VEGA908 + * ROCM 5.2.0 + * Hip, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_POWER9_VOLTA70_GCC930_CLANG13_CUDA10_OPENMP_SERIAL_CUDA_LEFT_OPENBLAS_OPENLAPACK_REL` + * 
Power8, Pascal60 -- Power9, Volta70 + * GNU 9.3.0 -- Clang 13.0.0, Cuda 10.1.243 + * OpenMp, Serial -- Cuda + * double, `complex_double` + * int + * int, size_t + * LayoutLeft + + * * `PR_POWER9_VOLTA70_CUDA11_OPENMP_CUDA_LEFT_RIGHT_REL` + * Power9, Volta70 + * GNU 8.3.1, Cuda 11.2.2 + * Cuda, OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `PR_SKX_GNU1020_OPENMP_LEFT_REL_NOETI` + * Skx + * GNU 10.2.0 + * OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_SKX_GNU1020_THREADS_SERIAL_RIGHT_REL` + * Skx + * GNU 10.2.0 + * Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutRight + + * * `PR_SKX_GNU1020_OPENMP_SERIAL_LEFT_OPENBLAS_OPENLAPACK_REL` + * Skx + * GNU 10.2.0 + * Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_SKX_INTEL19_OPENMP_LEFT_MKLBLAS_MKLLAPACK_REL` + * Skx + * Intel 19.5.281 + * OpenMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_SKX_CLANG1001_THREADS_SERIAL_LEFT_REL` + * Skx + * Clang 10.0.1 + * Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `PR_NONE_CLANG14001_SERIAL_LEFT_RIGHT_RELWITHDBG_BCHK` + * NONE + * Clang 14.0.0.14000029 + * Serial + * double, float, `complex_double`, `complex_float` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `PR_NONE_CLANG14001_THREADS_LEFT_RIGHT_RELWITHDBG_BCHK` + * NONE + * Clang 14.0.0.14000029 + * Serial + * double, float, `complex_double`, `complex_float` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `PR_NONE_CLANG14001_SERIAL_LEFT_RIGHT_DBG` + * NONE + * Clang 14.0.0.14000029 + * Serial + * double, float, `complex_double`, `complex_float` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `PR_NONE_CLANG14001_SERIAL_LEFT_RIGHT_REL_BCHK` + * NONE + * Clang 14.0.0.14000029 + * Serial + * double, float, `complex_double`, 
`complex_float` + * int + * int, `size_t` + * LayoutLeft, LayoutRight + + * * `NIGHTLY_SKX_GNU1020_OPENMP_THREADS_SERIAL_LEFT_DBG` + * SKX + * GNU 10.2.0 + * OpenMp, Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_GNU820_OPENMP_THREADS_SERIAL_LEFT_DBG` + * SKX + * GNU 8.2.0 + * OpenMp, Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_GNU820_OPENMP_THREADS_SERIAL_LEFT_REL` + * SKX + * GNU 8.2.0 + * OpenMp, Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_GNU920_OPENMP_THREADS_SERIAL_LEFT_DBG` + * SKX + * GNU 9.2.0 + * OpenMp, Threads, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_INTEL19_OPENMP_LEFT_DBG` + * SKX + * Intel 19.0.5 + * OpenMp + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_INTEL19_SERIAL_LEFT_DBG` + * SKX + * Intel 19.0.5 + * Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_INTEL19_THREADS_LEFT_DBG` + * SKX + * Intel 19.0.5 + * Threads + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_SKX_INTEL19_OPENMP_LEFT_MKL_DBG` + * SKX + * Intel 19.0.5 + * OPENMP + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_OPENMP_CUDA_LEFT_REL` + * SKX + * Cuda 11.2.2 + * OpenMP, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_SERIAL_CUDA_LEFT_REL` + * SKX + * Cuda 11.2.2 + * Serial, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_SERIAL_CUDA_LEFT_REL_UVM_RDC` + * SKX + * Cuda 11.2.2 + * Serial, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_SERIAL_CUDA_LEFT_DBG_BCHK` + * 
SKX + * Cuda 11.2.2 + * Serial, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_POWER9_VOLTA70_CUDA11_SERIAL_CUDA_LEFT_CUBLAS_CUSPARSE_REL_BCHK` + * SKX + * Cuda 11.2.2 + * Serial, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VEGA908_ROCM520_SERIAL_HIP_LEFT_REL` + * VEGA908 + * Rocm 5.2.0 + * Serial, Hip + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VEGA908_ROCM520_SERIAL_HIP_LEFT_ROCBLAS_ROCSPARSE_REL` + * VEGA908 + * Rocm 5.2.0 + * Serial, Hip + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VEGA906_ROCM520_SERIAL_HIP_LEFT_REL` + * VEGA906 + * Rocm 5.2.0 + * Serial, Hip + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VEGA906_ROCM520_SERIAL_HIP_LEFT_DBG_BCHK` + * VEGA906 + * Rocm 5.2.0 + * Serial, Hip + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_AMPERE80_CUDA11_SERIAL_CUDA_LEFT_DBG` + * AMPERE80 + * Cuda 11.7.99 + * Serial, Cuda + * double + * int + * `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CLANG10_SERIAL_OPENMP_THREADS_LEFT_REL` + * Volta70 + * Clang 10.0.0 + * Serial, OpenMP, Threads + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA10_CUDA_SERIAL_LEFT_RELWITHDBG` + * Volta70 + * Cuda 10.1 + * Serial, Cuda + * double + * int + * `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA117_CUDA_SERIAL_LEFT_RELWITHDBG` + * Volta70 + * Cuda 11.7 + * Serial, Cuda + * double + * int + * `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CLANG900_SERIAL_THREADS_LEFT_REL` + * Volta70 + * Clang 9.0.0 + * Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CLANG900_SERIAL_THREADS_LEFT_DBG` + * Volta70 + * Clang 9.0.0 + * Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + *
int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CLANG900_SERIAL_THREADS_LEFT_REL_CPP20` + * Volta70 + * Clang 9.0.0 + * Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA110_CUDA_OPENMP_LEFT_REL` + * Volta70 + * Cuda 11.0 + * OpenMP, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA120_CUDA_OPENMP_LEFT_REL` + * Volta70 + * Cuda 12.0 + * OpenMP, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA120_CUDA_OPENMP_LEFT_REL` + * Volta70 + * Cuda 12.0 + * OpenMP, Cuda + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_GNU830_SERIAL_OPENMP_THREADS_LEFT_REL` + * Volta70 + * Gnu 8.3.0 + * OpenMP, `OpenMP_Serial`, Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_GNU910_GNU920_SERIAL_OPENMP_THREADS_LEFT_REL` + * Volta70 + * Gnu 9.1.0, Gnu 9.2.0 + * OpenMP, `OpenMP_Serial`, Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_GNU830_GNU910_SERIAL_OPENMP_LEFT_OPENBLAS_OPENLAPACK_REL` + * Volta70 + * Gnu 9.1.0, Gnu 9.2.0 + * OpenMP, `OpenMP_Serial`, Serial, Threads, `Threads_Serial` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_A64FX_ARMPL2030_SERIAL_OPENMP_LEFT_ARMPLLBLAS_ARMPLSLAPACK_REL` + * A64FX + * Armpl 20.3.0 + * OpenMP, Serial + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA11_CUDA_OPENMP_SERIAL_PTHREAD_LEFT_REL` + * Volta70 + * Cuda 11.1.0 + * `Cuda_OpenMP`, `Cuda_Serial`, `Cuda_Pthread` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA11_CUDA_OPENMP`, `SERIAL_PTHREAD_LEFT_DBG_BCHK` + * Volta70 + * Cuda 11.1.0 + * `Cuda_OpenMP`, 
`Cuda_Serial`, `Cuda_Pthread` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA11_CUDA_OPENMP`, `SERIAL_PTHREAD_LEFT_DBG_BCHK` + * Volta70 + * Cuda 11.1.0 + * `Cuda_OpenMP`, `Cuda_Serial`, `Cuda_Pthread` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_VOLTA70_CUDA11_CUDA_OPENMP`, `SERIAL_PTHREAD_LEFT_REL_UVM` + * Volta70 + * Cuda 11.1.0 + * `Cuda_OpenMP`, `Cuda_Serial`, `Cuda_Pthread` + * double, `complex_double` + * int + * int, `size_t` + * LayoutLeft + + * * `NIGHTLY_HSW_INTEL19_OPENMP_LEFT_RELWITHDBG` + * Hsw + * Intel 19.1.3.20200925 + * OpenMP + * double + * int + * `size_t` + * LayoutLeft + + * * `NIGHTLY_KNL_INTEL19_OPENMP_LEFT_RELWITHDBG` + * Hsw + * Intel 19.1.3.20200925 + * OpenMP + * double + * int + * `size_t` + * LayoutLeft + +.. rubric:: Footnotes + +.. [#] This article has been authored by an employee of National Technology & Engineering Solutions of Sandia, LLC under Contract No. DE-NA0003525 with the U.S. Department of Energy (DOE). The employee owns all right, title and interest in and to the article and is solely responsible for its contents. The United States Government retains and the publisher, by accepting the article for publication, acknowledges that the United States Government retains a non-exclusive, paid-up, irrevocable, world-wide license to publish or reproduce the published form of this article or allow others to do so, for United States Government purposes. The DOE will provide public access to these results of federally sponsored research in accordance with the DOE Public Access Plan https://www.energy.gov/downloads/doe-public-access-plan. SAND2023-05267O. 
\ No newline at end of file diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp index 16231d64fe..238fdef187 100644 --- a/example/half/xpy.cpp +++ b/example/half/xpy.cpp @@ -17,6 +17,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" #include "KokkosKernels_default_types.hpp" +#include template struct Functor_xpy { diff --git a/graph/unit_test/Test_Graph_coarsen.hpp b/graph/unit_test/Test_Graph_coarsen.hpp index 6f0eda42f3..95f1533c88 100644 --- a/graph/unit_test/Test_Graph_coarsen.hpp +++ b/graph/unit_test/Test_Graph_coarsen.hpp @@ -422,7 +422,7 @@ void test_coarsen_random(lno_t numVerts, size_type nnz, lno_t bandwidth, defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #endif @@ -430,21 +430,21 @@ EXECUTE_TEST(double, int, int, TestExecSpace) defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #endif diff --git a/graph/unit_test/Test_Graph_graph_color.hpp b/graph/unit_test/Test_Graph_graph_color.hpp index 19ee9f600b..5d4eec03ca 100644 --- a/graph/unit_test/Test_Graph_graph_color.hpp +++ 
b/graph/unit_test/Test_Graph_graph_color.hpp @@ -96,9 +96,10 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap< lno_view_t, lno_nnz_view_t, typename lno_view_t::non_const_type, - typename lno_nnz_view_t::non_const_type, device>( - numRows, input_mat.graph.row_map, input_mat.graph.entries, sym_xadj, - sym_adj); + typename lno_nnz_view_t::non_const_type, + typename device::execution_space>(numRows, input_mat.graph.row_map, + input_mat.graph.entries, sym_xadj, + sym_adj); size_type numentries = sym_adj.extent(0); scalar_view_t newValues("vals", numentries); @@ -193,28 +194,28 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp 
b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp index 3b3cadd71b..7bd3c4cd40 100644 --- a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp +++ b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp @@ -247,28 +247,28 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/graph/unit_test/Test_Graph_graph_color_distance2.hpp b/graph/unit_test/Test_Graph_graph_color_distance2.hpp index d022a0d2d6..44ddaed0bf 100644 --- a/graph/unit_test/Test_Graph_graph_color_distance2.hpp +++ b/graph/unit_test/Test_Graph_graph_color_distance2.hpp @@ -343,28 +343,28 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
-EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #endif diff --git a/graph/unit_test/Test_Graph_mis2.hpp b/graph/unit_test/Test_Graph_mis2.hpp index faaea3b155..c6fb7562e7 100644 --- a/graph/unit_test/Test_Graph_mis2.hpp +++ b/graph/unit_test/Test_Graph_mis2.hpp @@ -274,7 +274,7 @@ void test_mis2_coarsening_zero_rows() { defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #endif @@ -282,21 +282,21 @@ EXECUTE_TEST(double, int, int, TestExecSpace) defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, 
size_t, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/graph/unit_test/Test_Graph_rcm.hpp b/graph/unit_test/Test_Graph_rcm.hpp index 861634071e..2e05554d2d 100644 --- a/graph/unit_test/Test_Graph_rcm.hpp +++ b/graph/unit_test/Test_Graph_rcm.hpp @@ -151,28 +151,28 @@ void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt new file mode 100644 index 0000000000..8ab784a325 --- /dev/null +++ b/lapack/CMakeLists.txt @@ -0,0 +1,67 @@ +LIST(APPEND KK_INCLUDE_DIRS 
${CMAKE_CURRENT_SOURCE_DIR}/lapack/src) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/impl) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/eti) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/lapack/eti) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lapack/tpls) + +# Adding unit-tests +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/lapack) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/lapack) + +######################### +# # +# Logic for LAPACK TPLs # +# # +######################### + +#Include LAPACK, Lapack host wrapper +IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) + #Do NOT add this to include path + APPEND_GLOB(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/tpls/KokkosLapack_Host_tpl.cpp) +ENDIF() + +# Include host lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Host_tpl.cpp + ) +ENDIF() + +# Include cuda lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Cuda_tpl.cpp + ) +ENDIF() + +# Include rocm lapack TPL source file +IF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER) + LIST(APPEND SOURCES + lapack/tpls/KokkosLapack_Rocm_tpl.cpp + ) +ENDIF() + +################## +# # +# ETI generation # +# # +################## + +#Build up a list of DECL, AVAIL, and INST macros +#that should be instantiated based on input options +#Generate @X@ variables in the template X.hpp.in and X.cpp.in +#files containing the list of all needed macros + +KOKKOSKERNELS_GENERATE_ETI(Lapack_gesv gesv + COMPONENTS lapack + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + +KOKKOSKERNELS_GENERATE_ETI(Lapack_trtri trtri + COMPONENTS lapack + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS 
LAYOUTS DEVICES +) diff --git a/blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in similarity index 88% rename from blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in rename to lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in index 64755f7a54..da521984a4 100644 --- a/blas/eti/generated_specializations_cpp/trtri/KokkosBlas_trtri_eti_spec_inst.cpp.in +++ b/lapack/eti/generated_specializations_cpp/gesv/KokkosLapack_gesv_eti_spec_inst.cpp.in @@ -17,10 +17,10 @@ #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true #include "KokkosKernels_config.h" -#include "KokkosBlas_trtri_spec.hpp" +#include "KokkosLapack_gesv_spec.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -@BLAS_TRTRI_ETI_INST_BLOCK@ +@LAPACK_GESV_ETI_INST_BLOCK@ } //IMPL } //Kokkos diff --git a/lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in b/lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..c4ab12f5a4 --- /dev/null +++ b/lapack/eti/generated_specializations_cpp/trtri/KokkosLapack_trtri_eti_spec_inst.cpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosLapack_trtri_spec.hpp" + +namespace KokkosLapack { +namespace Impl { +@LAPACK_TRTRI_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..d1f36e3069 --- /dev/null +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_avail.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL_HPP_ +namespace KokkosLapack { +namespace Impl { +@LAPACK_GESV_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in similarity index 73% rename from blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in rename to lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in index 3f669efa06..89443c2c9b 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_avail.hpp.in +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_avail.hpp.in @@ -14,13 +14,13 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ +namespace KokkosLapack { namespace Impl { -@BLAS_TRTRI_ETI_AVAIL_BLOCK@ +@LAPACK_TRTRI_ETI_AVAIL_BLOCK@ } // Impl -} // KokkosBlas -#endif // KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL_HPP_ +} // KokkosLapack +#endif // KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL_HPP_ diff --git a/blas/impl/KokkosBlas_gesv_impl.hpp b/lapack/impl/KokkosLapack_gesv_impl.hpp similarity index 73% rename from blas/impl/KokkosBlas_gesv_impl.hpp rename to lapack/impl/KokkosLapack_gesv_impl.hpp index e51e48309f..3a60f42171 100644 --- a/blas/impl/KokkosBlas_gesv_impl.hpp +++ b/lapack/impl/KokkosLapack_gesv_impl.hpp @@ -14,21 +14,21 @@ // //@HEADER -#ifndef KOKKOSBLAS_IMPL_GESV_HPP_ -#define KOKKOSBLAS_IMPL_GESV_HPP_ +#ifndef KOKKOSLAPACK_IMPL_GESV_HPP_ +#define KOKKOSLAPACK_IMPL_GESV_HPP_ -/// \file KokkosBlas_gesv_impl.hpp +/// \file KokkosLapack_gesv_impl.hpp /// \brief Implementation(s) of dense linear solve. 
#include #include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -// NOTE: Might add the implementation of KokkosBlas::gesv later +// NOTE: Might add the implementation of KokkosLapack::gesv later } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack -#endif // KOKKOSBLAS_IMPL_GESV_HPP +#endif // KOKKOSLAPACK_IMPL_GESV_HPP diff --git a/blas/impl/KokkosBlas_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp similarity index 74% rename from blas/impl/KokkosBlas_gesv_spec.hpp rename to lapack/impl/KokkosLapack_gesv_spec.hpp index f1dff467c8..b9f8549311 100644 --- a/blas/impl/KokkosBlas_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ -#define KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ +#ifndef KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ +#define KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ #include #include @@ -22,10 +22,10 @@ // Include the actual functors #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -#include +#include #endif -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists template @@ -33,37 +33,37 @@ struct gesv_eti_spec_avail { enum : bool { value = false }; }; } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // Macro for declaration of full specialization availability -// KokkosBlas::Impl::GESV. This is NOT for users!!! All +// KokkosLapack::Impl::GESV. This is NOT for users!!! All // the declarations of full specializations go in this header file. // We may spread out definitions (see _INST macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct gesv_eti_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct gesv_eti_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations -#include -#include +#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Unification layer -/// \brief Implementation of KokkosBlas::gesv. +/// \brief Implementation of KokkosLapack::gesv. template ::value, @@ -79,54 +79,54 @@ template struct GESV { static void gesv(const AMatrix & /* A */, const BXMV & /* B */, const IPIVV & /* IPIV */) { - // NOTE: Might add the implementation of KokkosBlas::gesv later + // NOTE: Might add the implementation of KokkosLapack::gesv later throw std::runtime_error( "No fallback implementation of GESV (general LU factorization & solve) " - "exists. Enable BLAS and/or MAGMA TPL."); + "exists. Enable LAPACK and/or MAGMA TPL."); } }; #endif } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // Macro for declaration of full specialization of -// KokkosBlas::Impl::GESV. This is NOT for users!!! All +// KokkosLapack::Impl::GESV. This is NOT for users!!! All // the declarations of full specializations go in this header file. // We may spread out definitions (see _DEF macro below) across one or // more .cpp files. 
// -#define KOKKOSBLAS_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - extern template struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#include +#include -#endif // KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ +#endif // KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas_trtri_impl.hpp b/lapack/impl/KokkosLapack_trtri_impl.hpp similarity index 91% rename from blas/impl/KokkosBlas_trtri_impl.hpp rename to lapack/impl/KokkosLapack_trtri_impl.hpp index 4501763ea8..9f52c2d412 100644 --- a/blas/impl/KokkosBlas_trtri_impl.hpp +++ b/lapack/impl/KokkosLapack_trtri_impl.hpp @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_IMPL_HPP_ -#define KOKKOSBLAS_TRTRI_IMPL_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_IMPL_HPP_ +#define KOKKOSLAPACK_TRTRI_IMPL_HPP_ /** - * \file KokkosBlas_trtri_impl.hpp + * \file KokkosLapack_trtri_impl.hpp * \brief Implementation of triangular matrix inverse */ @@ -27,7 +27,7 @@ #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" -namespace 
KokkosBlas { +namespace KokkosLapack { namespace Impl { template @@ -65,5 +65,5 @@ void SerialTrtri_Invoke(const RViewType &R, const char uplo[], } } } // namespace Impl -} // namespace KokkosBlas -#endif // KOKKOSBLAS_TRTRI_IMPL_HPP_ +} // namespace KokkosLapack +#endif // KOKKOSLAPACK_TRTRI_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas_trtri_spec.hpp b/lapack/impl/KokkosLapack_trtri_spec.hpp similarity index 77% rename from blas/impl/KokkosBlas_trtri_spec.hpp rename to lapack/impl/KokkosLapack_trtri_spec.hpp index 2a4d2db576..a17184dc41 100644 --- a/blas/impl/KokkosBlas_trtri_spec.hpp +++ b/lapack/impl/KokkosLapack_trtri_spec.hpp @@ -13,17 +13,17 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_SPEC_HPP_ -#define KOKKOSBLAS_TRTRI_SPEC_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_SPEC_HPP_ +#define KOKKOSLAPACK_TRTRI_SPEC_HPP_ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -#include +#include #endif -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists template @@ -31,14 +31,14 @@ struct trtri_eti_spec_avail { enum : bool { value = false }; }; } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // This Macros provides the ETI specialization of trtri, currently not // available. // -#define KOKKOSBLAS_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ - MEM_SPACE) \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_AVAIL(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ template <> \ struct trtri_eti_spec_avail< \ Kokkos::View -#include +#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // @@ -77,8 +77,8 @@ struct TRTRI { static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - ? 
"KokkosBlas::trtri[ETI]" - : "KokkosBlas::trtri[noETI]"); + ? "KokkosLapack::trtri[ETI]" + : "KokkosLapack::trtri[noETI]"); typename AVIT::HostMirror host_A = Kokkos::create_mirror_view(A); typename RVIT::HostMirror host_R = Kokkos::create_mirror_view(R); @@ -97,7 +97,7 @@ struct TRTRI { //! KOKKOSKERNELS_IMPL_COMPILE_LIBRARY } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack // // These Macros are only included when we are not compiling libkokkoskernels but @@ -106,22 +106,24 @@ struct TRTRI { // "extern template" skips the implicit instatiation step ensuring that the // callers code uses this explicit instantiation definition of TRTRI. // -#define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ - extern template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ + extern template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSBLAS_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ - template struct TRTRI< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, \ + MEM_SPACE) \ + template struct TRTRI< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; -#include +#include -#endif // KOKKOSBLAS_TRTRI_SPEC_HPP_ +#endif // KOKKOSLAPACK_TRTRI_SPEC_HPP_ diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp new file mode 100644 index 0000000000..4c9058f8ab --- /dev/null +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -0,0 +1,151 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file KokkosLapack_gesv.hpp +/// \brief Local dense linear solve +/// +/// This file provides KokkosLapack::gesv. This function performs a +/// local (no MPI) dense linear solve on a system of linear equations +/// A * X = B where A is a general N-by-N matrix and X and B are N-by-NRHS +/// matrices. + +#ifndef KOKKOSLAPACK_GESV_HPP_ +#define KOKKOSLAPACK_GESV_HPP_ + +#include + +#include "KokkosLapack_gesv_spec.hpp" +#include "KokkosKernels_Error.hpp" + +namespace KokkosLapack { + +/// \brief Solve the dense linear equation system A*X = B. +/// +/// \tparam AMatrix Input matrix/Output LU, as a 2-D Kokkos::View. +/// \tparam BXMV Input (right-hand side)/Output (solution) (multi)vector, as a +/// 1-D or 2-D Kokkos::View. +/// \tparam IPIVV Output pivot indices, as a 1-D Kokkos::View. +/// +/// \param A [in,out] On entry, the N-by-N matrix to be solved. On exit, the +/// factors L and U from +/// the factorization A = P*L*U; the unit diagonal elements of L are not +/// stored. +/// \param B [in,out] On entry, the right hand side (multi)vector B. On exit, +/// the solution (multi)vector X. +/// \param IPIV [out] On exit, the pivot indices (for partial pivoting). If the +/// View extents are zero and its data pointer is NULL, pivoting is not used. +/// +template +void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { + // NOTE: Currently, KokkosLapack::gesv only supports the MAGMA TPL and LAPACK + // TPL. 
+ // MAGMA TPL should be enabled to call the MAGMA GPU interface for + // device views; LAPACK TPL should be enabled to call the LAPACK + // interface for host views. + + static_assert(Kokkos::is_view::value, + "KokkosLapack::gesv: A must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::gesv: B must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::gesv: IPIV must be a Kokkos::View."); + static_assert(static_cast(AMatrix::rank) == 2, + "KokkosLapack::gesv: A must have rank 2."); + static_assert( + static_cast(BXMV::rank) == 1 || static_cast(BXMV::rank) == 2, + "KokkosLapack::gesv: B must have either rank 1 or rank 2."); + static_assert(static_cast(IPIVV::rank) == 1, + "KokkosLapack::gesv: IPIV must have rank 1."); + + int64_t IPIV0 = IPIV.extent(0); + int64_t A0 = A.extent(0); + int64_t A1 = A.extent(1); + int64_t B0 = B.extent(0); + + // Check validity of pivot argument + bool valid_pivot = + (IPIV0 == A1) || ((IPIV0 == 0) && (IPIV.data() == nullptr)); + if (!(valid_pivot)) { + std::ostringstream os; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " + << "Valid options include zero-extent 1-D view (no pivoting), or 1-D " + "View with size of " + << A0 << " (partial pivoting)."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Check for the no-pivoting case. Only MAGMA supports a no-pivoting interface +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL + if ((!std::is_same::value) && + (IPIV0 == 0) && (IPIV.data() == nullptr)) { + std::ostringstream os; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". 
" + << "LAPACK TPL does not support no pivoting."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +#endif +#else // not have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL + if ((IPIV0 == 0) && (IPIV.data() == nullptr)) { + std::ostringstream os; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". " + << "LAPACK TPL does not support no pivoting."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +#endif +#endif + + // Check compatibility of dimensions at run time. + if ((A0 < A1) || (A0 != B0)) { + std::ostringstream os; + os << "KokkosLapack::gesv: Dimensions of A, and B do not match: " + << " A: " << A.extent(0) << " x " << A.extent(1) << " B: " << B.extent(0) + << " x " << B.extent(1); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + typedef Kokkos::View< + typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, + typename AMatrix::device_type, Kokkos::MemoryTraits > + AMatrix_Internal; + typedef Kokkos::View > + BXMV_Internal; + typedef Kokkos::View< + typename IPIVV::non_const_value_type*, typename IPIVV::array_layout, + typename IPIVV::device_type, Kokkos::MemoryTraits > + IPIVV_Internal; + AMatrix_Internal A_i = A; + // BXMV_Internal B_i = B; + IPIVV_Internal IPIV_i = IPIV; + + if (BXMV::rank == 1) { + auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); + KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + } else { // BXMV::rank == 2 + auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); + KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + } +} + +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_GESV_HPP_ diff --git a/lapack/src/KokkosLapack_trtri.hpp b/lapack/src/KokkosLapack_trtri.hpp new file mode 100644 index 0000000000..9a884f2303 --- /dev/null +++ b/lapack/src/KokkosLapack_trtri.hpp @@ -0,0 +1,119 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_TRTRI_HPP_ +#define KOKKOSLAPACK_TRTRI_HPP_ + +/// \file KokkosLapack_trtri.hpp + +#include "KokkosKernels_Macros.hpp" +#include "KokkosLapack_trtri_spec.hpp" +#include "KokkosKernels_helpers.hpp" +#include +#include +#include "KokkosKernels_Error.hpp" + +namespace KokkosLapack { + +/// \brief Find the inverse of the triangular matrix, A +/// +/// A = inv(A) +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// +/// \param uplo [in] "U" or "u" indicates matrix A is an upper triangular +/// matrix +/// "L" or "l" indicates matrix A is a lower triangular matrix +/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit +/// "N" or "n" indicates the diagonal of A is assumed to be +/// non-unit +/// \param A [in,out] Input matrix, as a 2-D Kokkos::View +/// On entry, A +/// On successful exit, inv(A) +/// \return 0 upon success, +/// i if the i-th diagonal element of A is zero, A is singular, +/// and the inversion could not be completed. 
+// source: https://software.intel.com/en-us/mkl-developer-reference-c-trtri +template +int trtri(const char uplo[], const char diag[], const AViewType& A) { + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(static_cast(AViewType::rank) == 2, + "AViewType must have rank 2."); + + // Check validity of indicator argument + bool valid_uplo = (uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || + (uplo[0] == 'l'); + bool valid_diag = (diag[0] == 'U') || (diag[0] == 'u') || (diag[0] == 'N') || + (diag[0] == 'n'); + + if (!valid_uplo) { + std::ostringstream os; + os << "KokkosLapack::trtri: uplo = '" << uplo[0] << "'. " + << "Valid values include 'U' or 'u' (A is upper triangular), " + "'L' or 'l' (A is lower triangular)."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + if (!valid_diag) { + std::ostringstream os; + os << "KokkosLapack::trtri: diag = '" << diag[0] << "'. " + << "Valid values include 'U' or 'u' (the diagonal of A is assumed to be " + "unit), " + "'N' or 'n' (the diagonal of A is assumed to be non-unit)."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + int64_t A_m = A.extent(0); + int64_t A_n = A.extent(1); + + // Return if degenerated matrices are provided + if (A_m == 0 || A_n == 0) + return 0; // This is success as the inverse of a matrix with no elements is + // itself. 
+ + // Ensure that A is square: only a square triangular matrix can be inverted + // in place + if (A_m != A_n) { + std::ostringstream os; + os << "KokkosLapack::trtri: Dimensions of A do not match," + << " A: " << A.extent(0) << " x " << A.extent(1); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Create A matrix view type alias + using AViewInternalType = + Kokkos::View >; + + // This is the return value type and should always reside on host + using RViewInternalType = + Kokkos::View >; + + int result; + RViewInternalType R = RViewInternalType(&result); + + KokkosLapack::Impl::TRTRI::trtri( + R, uplo, diag, A); + + return result; +} + +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_TRTRI_HPP_ diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.cpp b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp new file mode 100644 index 0000000000..2ac28871a4 --- /dev/null +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.cpp @@ -0,0 +1,18 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#include +#include +#include diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp new file mode 100644 index 0000000000..b59d6d99c8 --- /dev/null +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -0,0 +1,64 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_CUDA_TPL_HPP_ +#define KOKKOSLAPACK_CUDA_TPL_HPP_ + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) +#include + +namespace KokkosLapack { +namespace Impl { + +CudaLapackSingleton::CudaLapackSingleton() { + cusolverStatus_t stat = cusolverDnCreate(&handle); + if (stat != CUSOLVER_STATUS_SUCCESS) + Kokkos::abort("CUSOLVER initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { cusolverDnDestroy(handle); }); +} + +CudaLapackSingleton& CudaLapackSingleton::singleton() { + static CudaLapackSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#include + +namespace KokkosLapack { +namespace Impl { + +MagmaSingleton::MagmaSingleton() { + magma_int_t stat = magma_init(); + if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { magma_finalize(); }); +} + +MagmaSingleton& MagmaSingleton::singleton() { + static MagmaSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + +#endif // KOKKOSLAPACK_CUDA_TPL_HPP_ diff --git a/lapack/tpls/KokkosLapack_Host_tpl.cpp b/lapack/tpls/KokkosLapack_Host_tpl.cpp new file mode 100644 index 0000000000..d629a17f1d --- /dev/null +++ b/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -0,0 +1,152 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \file KokkosLapack_Host_tpl.cpp +/// \brief LAPACK wrapper for host tpls +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosKernels_config.h" +#include "KokkosLapack_Host_tpl.hpp" + +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) + +/// Fortran headers +extern "C" { + +/// +/// Gesv +/// + +void F77_BLAS_MANGLE(sgesv, SGESV)(int*, int*, float*, int*, int*, float*, int*, + int*); +void F77_BLAS_MANGLE(dgesv, DGESV)(int*, int*, double*, int*, int*, double*, + int*, int*); +void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, + std::complex*, int*, int*); +void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, + int*, std::complex*, int*, int*); + +/// +/// Trtri +/// +/* + HostLapack::trtri(const char uplo, const char diag, + int n, const float *a, int lda) { + int info = 0; + F77_FUNC_STRTRI(&uplo, + &diag, &n, + a, &lda, &info); +*/ +void F77_BLAS_MANGLE(strtri, STRTRI)(const char*, const char*, int*, + const float*, int*, int*); +void F77_BLAS_MANGLE(dtrtri, DTRTRI)(const char*, const char*, int*, + const double*, int*, int*); +void F77_BLAS_MANGLE(ctrtri, CTRTRI)(const char*, const char*, int*, + const std::complex*, int*, int*); +void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, + const std::complex*, int*, int*); +} + +#define F77_FUNC_SGESV F77_BLAS_MANGLE(sgesv, SGESV) +#define F77_FUNC_DGESV F77_BLAS_MANGLE(dgesv, DGESV) +#define F77_FUNC_CGESV F77_BLAS_MANGLE(cgesv, CGESV) +#define F77_FUNC_ZGESV F77_BLAS_MANGLE(zgesv, ZGESV) + +#define F77_FUNC_STRTRI F77_BLAS_MANGLE(strtri, STRTRI) +#define F77_FUNC_DTRTRI F77_BLAS_MANGLE(dtrtri, DTRTRI) +#define 
F77_FUNC_CTRTRI F77_BLAS_MANGLE(ctrtri, CTRTRI) +#define F77_FUNC_ZTRTRI F77_BLAS_MANGLE(ztrtri, ZTRTRI) + +namespace KokkosLapack { +namespace Impl { + +/// +/// float +/// + +template <> +void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, + float* b, int ldb, int info) { + F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack::trtri(const char uplo, const char diag, int n, + const float* a, int lda) { + int info = 0; + F77_FUNC_STRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// double +/// + +template <> +void HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, + double* b, int ldb, int info) { + F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack::trtri(const char uplo, const char diag, int n, + const double* a, int lda) { + int info = 0; + F77_FUNC_DTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// std::complex +/// + +template <> +void HostLapack >::gesv(int n, int rhs, + std::complex* a, int lda, + int* ipiv, std::complex* b, + int ldb, int info) { + F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack >::trtri(const char uplo, const char diag, + int n, const std::complex* a, + int lda) { + int info = 0; + F77_FUNC_CTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +/// +/// std::complex +/// + +template <> +void HostLapack >::gesv(int n, int rhs, + std::complex* a, int lda, + int* ipiv, std::complex* b, + int ldb, int info) { + F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); +} +template <> +int HostLapack >::trtri(const char uplo, const char diag, + int n, + const std::complex* a, + int lda) { + int info = 0; + F77_FUNC_ZTRTRI(&uplo, &diag, &n, a, &lda, &info); + return info; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK diff --git a/common/src/KokkosKernels_AlwaysFalse.hpp 
b/lapack/tpls/KokkosLapack_Host_tpl.hpp similarity index 50% rename from common/src/KokkosKernels_AlwaysFalse.hpp rename to lapack/tpls/KokkosLapack_Host_tpl.hpp index 36f4572d29..d74099aaec 100644 --- a/common/src/KokkosKernels_AlwaysFalse.hpp +++ b/lapack/tpls/KokkosLapack_Host_tpl.hpp @@ -14,26 +14,31 @@ // //@HEADER -#ifndef KOKKOSKERNELS_ALWAYSFALSE_HPP -#define KOKKOSKERNELS_ALWAYSFALSE_HPP +#ifndef KOKKOSLAPACK_HOST_TPL_HPP_ +#define KOKKOSLAPACK_HOST_TPL_HPP_ -#include +/// \file KokkosLapack_Host_tpl.hpp +/// \brief LAPACK wrapper -/*! \file KokkosKernels_AlwaysFalse.hpp - \brief A convenience type to be used in a static_assert that should always - fail -*/ +#include "KokkosKernels_config.h" +#include "Kokkos_ArithTraits.hpp" -namespace KokkosKernels { -namespace Impl { +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) -template -using always_false = std::false_type; +namespace KokkosLapack { +namespace Impl { template -inline constexpr bool always_false_v = always_false::value; +struct HostLapack { + static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, + int info); + static int trtri(const char uplo, const char diag, int n, const T *a, + int lda); +}; } // namespace Impl -} // namespace KokkosKernels +} // namespace KokkosLapack + +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK -#endif // KOKKOSKERNELS_ALWAYSFALSE_HPP +#endif // KOKKOSLAPACK_HOST_TPL_HPP_ diff --git a/blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp similarity index 60% rename from blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp rename to lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index f909b4a295..a3d8bb6ee9 100644 --- a/blas/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -14,10 +14,10 @@ // //@HEADER -#ifndef KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_HPP_ +#ifndef KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_HPP_ +#define 
KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists template @@ -25,10 +25,10 @@ struct gesv_tpl_spec_avail { enum : bool { value = false }; }; -// Generic Host side BLAS (could be MKL or whatever) -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +// Generic Host side LAPACK (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ Kokkos::View, \ @@ -38,30 +38,30 @@ struct gesv_tpl_spec_avail { enum : bool { value = true }; \ }; -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( double, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if defined (KOKKOSKERNELS_INST_FLOAT) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( float, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if 
defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) #endif */ #endif @@ -69,7 +69,7 @@ Kokkos::LayoutRight, Kokkos::HostSpace) #endif // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ Kokkos::View, \ @@ -79,36 +79,36 @@ Kokkos::LayoutRight, Kokkos::HostSpace) #endif enum : bool { value = true }; \ }; -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_FLOAT) \ && 
defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex,Kokkos::LayoutRight, Kokkos::CudaSpace) #endif #if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, + KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) #endif */ #endif } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack #endif diff --git a/blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp similarity index 87% rename from blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp rename to lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 7d8f0a8a2b..2baa76a132 100644 --- a/blas/tpls/KokkosBlas_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -14,21 +14,21 @@ // //@HEADER -#ifndef KOKKOSBLAS_GESV_TPL_SPEC_DECL_HPP_ -#define KOKKOSBLAS_GESV_TPL_SPEC_DECL_HPP_ +#ifndef KOKKOSLAPACK_GESV_TPL_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_GESV_TPL_SPEC_DECL_HPP_ -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { template inline void gesv_print_specialization() { #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA - printf("KokkosBlas::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", + printf("KokkosLapack::gesv<> TPL MAGMA specialization for < %s , %s, %s >\n", typeid(AViewType).name(), typeid(BViewType).name(), typeid(PViewType).name()); #else -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - printf("KokkosBlas::gesv<> TPL Blas specialization for < %s , %s, %s >\n", +#ifdef 
KOKKOSKERNELS_ENABLE_TPL_LAPACK + printf("KokkosLapack::gesv<> TPL Lapack specialization for < %s , %s, %s >\n", typeid(AViewType).name(), typeid(BViewType).name(), typeid(PViewType).name()); #endif @@ -36,16 +36,16 @@ inline void gesv_print_specialization() { #endif } } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack -// Generic Host side BLAS (could be MKL or whatever) -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#include +// Generic Host side LAPACK (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#define KOKKOSBLAS_DGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -74,7 +74,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_BLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -89,65 +89,65 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ - LDB, info); \ + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), \ + B.data(), LDB, info); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS_SGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - 
PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_BLAS,float]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostBlas::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ - LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_SGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GESV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View< \ + int*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + PViewType; \ + \ + static void gesv(const AViewType& A, const BViewType& B, \ + const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,float]"); \ + gesv_print_specialization(); \ + const bool with_pivot = \ + !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ + \ + const int N = static_cast(A.extent(1)); \ + const int AST = static_cast(A.stride(1)); \ + const int LDA = (AST == 0) ? 1 : AST; \ + const int BST = static_cast(B.stride(1)); \ + const int LDB = (BST == 0) ? 
1 : BST; \ + const int NRHS = static_cast(B.extent(1)); \ + \ + int info = 0; \ + \ + if (with_pivot) { \ + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ + LDB, info); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS_ZGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_ZGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -178,7 +178,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_BLAS,complex]"); \ + "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -193,7 +193,7 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas >::gesv( \ + HostLapack >::gesv( \ N, NRHS, reinterpret_cast*>(A.data()), LDA, \ IPIV.data(), reinterpret_cast*>(B.data()), \ LDB, info); \ @@ -202,7 +202,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_CGESV_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_CGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -233,7 +233,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_BLAS,complex]"); \ + "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -248,7 +248,7 @@ namespace Impl { int info = 0; \ \ if (with_pivot) { \ - HostBlas >::gesv( \ + HostLapack >::gesv( \ N, NRHS, reinterpret_cast*>(A.data()), LDA, \ IPIV.data(), reinterpret_cast*>(B.data()), \ LDB, info); \ @@ -257,30 +257,30 @@ namespace Impl { } \ }; -KOKKOSBLAS_DGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_DGESV_BLAS(Kokkos::LayoutLeft, 
Kokkos::HostSpace, false) +KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_SGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_SGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_ZGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_ZGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS_CGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS_CGESV_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) } // namespace Impl -} // namespace KokkosBlas -#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include +#include -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#define KOKKOSBLAS_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -309,7 +309,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_MAGMA,double]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,double]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -321,8 +321,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 
1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -339,7 +339,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV< \ Kokkos::View, \ @@ -368,7 +368,7 @@ namespace Impl { \ static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gesv[TPL_MAGMA,float]"); \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,float]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -380,8 +380,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -398,7 +398,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -429,7 +429,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_MAGMA,complex]"); \ + "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -441,8 +441,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 
1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -459,7 +459,7 @@ namespace Impl { } \ }; -#define KOKKOSBLAS_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSLAPACK_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GESV**, LAYOUT, \ Kokkos::Device, \ @@ -490,7 +490,7 @@ namespace Impl { static void gesv(const AViewType& A, const BViewType& B, \ const PViewType& IPIV) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gesv[TPL_MAGMA,complex]"); \ + "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ gesv_print_specialization(); \ const bool with_pivot = \ !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ @@ -502,8 +502,8 @@ namespace Impl { magma_int_t LDB = (BST == 0) ? 1 : BST; \ magma_int_t NRHS = static_cast(B.extent(1)); \ \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ magma_int_t info = 0; \ \ if (with_pivot) { \ @@ -520,20 +520,20 @@ namespace Impl { } \ }; -KOKKOSBLAS_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, 
false) +KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA #endif diff --git a/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp new file mode 100644 index 0000000000..7251d97086 --- /dev/null +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_avail.hpp @@ -0,0 +1,133 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosLapack { +namespace Impl { + +// Specialization struct which defines whether a specialization exists +template +struct trtri_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side LAPACK (could be MKL or whatever) +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ + template \ + struct trtri_tpl_spec_avail< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUTA, MEMSPACE) \ + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) +#else +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUTA, MEMSPACE) +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) \ + KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) +#else +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUTA, MEMSPACE) +#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) 
+#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutRight, + Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutRight, + Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +#endif + +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HostSpace) +#ifdef KOKKOS_ENABLE_CUDA +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSLAPACK_TRTRI_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +#endif + +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSLAPACKy_TRTRI_TPL_SPEC_AVAIL_HPP_ diff 
--git a/blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp similarity index 50% rename from blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp rename to lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index 46ec894547..3ed0623018 100644 --- a/blas/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -14,18 +14,18 @@ // //@HEADER -#ifndef KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ -#define KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ +#ifndef KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ -#include "KokkosBlas_Host_tpl.hpp" // trtri prototype -#include "KokkosBlas_tpl_spec.hpp" +#include "KokkosLapack_Host_tpl.hpp" // trtri prototype +//#include "KokkosLapack_tpl_spec.hpp" -namespace KokkosBlas { +namespace KokkosLapack { namespace Impl { -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) \ +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ + MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRTRI >, \ @@ -44,8 +44,8 @@ namespace Impl { \ static void trtri(const RViewType& R, const char uplo[], \ const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trtri[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ + Kokkos::Profiling::pushRegion( \ + "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ const int M = static_cast(A.extent(0)); \ \ bool A_is_layout_left = \ @@ -61,136 +61,164 @@ namespace Impl { else \ uplo_ = A_is_layout_left ? 
'U' : 'L'; \ \ - R() = HostBlas::trtri( \ + R() = HostLapack::trtri( \ uplo_, diag[0], M, \ reinterpret_cast(A.data()), LDA); \ Kokkos::Profiling::popRegion(); \ } \ }; #else -#define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ - MEM_SPACE, ETI_SPEC_AVAIL) -#endif // KOKKOSKERNELS_ENABLE_TPL_BLAS +#define KOKKOSLAPACK_TRTRI_LAPACK_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ + MEM_SPACE, ETI_SPEC_AVAIL) +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#define KOKKOSBLAS_TRTRI_BLAS_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ - LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRTRI >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ - RViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void trtri(const RViewType& R, const char uplo[], \ - const char diag[], const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trtri[TPL_BLAS," #SCALAR_TYPE \ - "]"); \ - magma_int_t M = static_cast(A.extent(0)); \ - \ - bool A_is_layout_left = \ - std::is_same::value; \ - \ - magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ - LDA = (AST == 0) ? 1 : AST; \ - magma_int_t info = 0; \ - magma_uplo_t uplo_; \ - magma_diag_t diag_; \ - \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ - else \ - uplo_ = A_is_layout_left ? 
MagmaUpper : MagmaLower; \ - \ - if (diag[0] == 'U' || diag[0] == 'u') \ - diag_ = MagmaUnit; \ - else \ - diag_ = MagmaNonUnit; \ - \ - KokkosBlas::Impl::MagmaSingleton& s = \ - KokkosBlas::Impl::MagmaSingleton::singleton(); \ - R() = MAGMA_FN(uplo_, diag_, M, \ - reinterpret_cast( \ - const_cast(A.data())), \ - LDA, &info); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ + MAGMA_FN, LAYOUTA, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct TRTRI >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View > \ + RViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void trtri(const RViewType& R, const char uplo[], \ + const char diag[], const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosLapack::trtri[TPL_LAPACK," #SCALAR_TYPE "]"); \ + magma_int_t M = static_cast(A.extent(0)); \ + \ + bool A_is_layout_left = \ + std::is_same::value; \ + \ + magma_int_t AST = A_is_layout_left ? A.stride(1) : A.stride(0), \ + LDA = (AST == 0) ? 1 : AST; \ + magma_int_t info = 0; \ + magma_uplo_t uplo_; \ + magma_diag_t diag_; \ + \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = A_is_layout_left ? MagmaLower : MagmaUpper; \ + else \ + uplo_ = A_is_layout_left ? 
MagmaUpper : MagmaLower; \ + \ + if (diag[0] == 'U' || diag[0] == 'u') \ + diag_ = MagmaUnit; \ + else \ + diag_ = MagmaNonUnit; \ + \ + KokkosLapack::Impl::MagmaSingleton& s = \ + KokkosLapack::Impl::MagmaSingleton::singleton(); \ + R() = MAGMA_FN(uplo_, diag_, M, \ + reinterpret_cast( \ + const_cast(A.data())), \ + LDA, &info); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #else -#define KOKKOSBLAS_TRTRI_BLAS_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, MAGMA_FN, \ - LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) +#define KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(SCALAR_TYPE, BASE_SCALAR_TYPE, \ + MAGMA_FN, LAYOUTA, MEM_SPACE, \ + ETI_SPEC_AVAIL) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA // Explicitly define the TRTRI class for all permutations listed below // Handle type and space permutations -#define KOKKOSBLAS_DTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS_STRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ - LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) - -#define KOKKOSBLAS_ZTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ - magma_ztrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaDoubleComplex_ptr, \ - magma_ztrtri_gpu, LAYOUTA, 
Kokkos::CudaUVMSpace, \ - ETI_SPEC_AVAIL) - -#define KOKKOSBLAS_CTRTRI_BLAS(LAYOUTA, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_HOST(Kokkos::complex, std::complex, \ - LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ - magma_ctrtri_gpu, LAYOUTA, Kokkos::CudaSpace, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS_TRTRI_BLAS_MAGMA(Kokkos::complex, magmaFloatComplex_ptr, \ - magma_ctrtri_gpu, LAYOUTA, Kokkos::CudaUVMSpace, \ - ETI_SPEC_AVAIL) +#ifdef KOKKOS_ENABLE_CUDA + +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(double, magmaDouble_ptr, magma_dtrtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(float, magmaFloat_ptr, magma_strtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ + std::complex, LAYOUTA, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ + magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ + Kokkos::complex, magmaDoubleComplex_ptr, magma_ztrtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ + LAYOUTA, 
Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA(Kokkos::complex, \ + magmaFloatComplex_ptr, magma_ctrtri_gpu, \ + LAYOUTA, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_MAGMA( \ + Kokkos::complex, magmaFloatComplex_ptr, magma_ctrtri_gpu, \ + LAYOUTA, Kokkos::CudaUVMSpace, ETI_SPEC_AVAIL) + +#else + +#define KOKKOSLAPACK_DTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(double, double, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_STRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(float, float, LAYOUTA, Kokkos::HostSpace, \ + ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_ZTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, \ + std::complex, LAYOUTA, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) + +#define KOKKOSLAPACK_CTRTRI_LAPACK(LAYOUTA, ETI_SPEC_AVAIL) \ + KOKKOSLAPACK_TRTRI_LAPACK_HOST(Kokkos::complex, std::complex, \ + LAYOUTA, Kokkos::HostSpace, ETI_SPEC_AVAIL) + +#endif // Handle layout permutations -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_DTRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_STRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_ZTRTRI_BLAS(Kokkos::LayoutRight, false) - -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutLeft, true) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutLeft, false) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutRight, true) -KOKKOSBLAS_CTRTRI_BLAS(Kokkos::LayoutRight, false) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutLeft, false) 
+KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_DTRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_STRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_ZTRTRI_LAPACK(Kokkos::LayoutRight, false) + +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutLeft, true) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutLeft, false) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutRight, true) +KOKKOSLAPACK_CTRTRI_LAPACK(Kokkos::LayoutRight, false) } // namespace Impl -} // nameSpace KokkosBlas +} // nameSpace KokkosLapack -#endif // KOKKOSBLAS_TRTRI_TPL_SPEC_DECL_HPP_ +#endif // KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ diff --git a/lapack/unit_test/CMakeLists.txt b/lapack/unit_test/CMakeLists.txt new file mode 100644 index 0000000000..a2c2305a12 --- /dev/null +++ b/lapack/unit_test/CMakeLists.txt @@ -0,0 +1,94 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/test_common) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${PACKAGE_SOURCE_DIR}/test_common) + +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + +##################### +# # +# Define unit-tests # +# # +##################### + +##################### +# # +# Add GPU backends # +# # +##################### +IF (KOKKOS_ENABLE_CUDA) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_cuda + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Cuda_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_HIP) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_hip + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + 
backends/Test_HIP_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_SYCL) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_sycl + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_SYCL_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_OPENMPTARGET) + # KOKKOSKERNELS_ADD_UNIT_TEST( + # lapack_openmptarget + # SOURCES + # ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + # backends/Test_OpenMPTarget_Lapack.cpp + # COMPONENTS lapack + # ) +ENDIF () + + + +##################### +# # +# Add CPU backends # +# # +##################### +IF (KOKKOS_ENABLE_SERIAL) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_serial + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Serial_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_OPENMP) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_openmp + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_OpenMP_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + +IF (KOKKOS_ENABLE_THREADS) + KOKKOSKERNELS_ADD_UNIT_TEST( + lapack_threads + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Threads_Lapack.cpp + COMPONENTS lapack + ) +ENDIF () + diff --git a/lapack/unit_test/Test_Lapack.hpp b/lapack/unit_test/Test_Lapack.hpp new file mode 100644 index 0000000000..815c442884 --- /dev/null +++ b/lapack/unit_test/Test_Lapack.hpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_LAPACK_HPP +#define TEST_LAPACK_HPP + +#include "Test_Lapack_gesv.hpp" +#include "Test_Lapack_trtri.hpp" + +#endif // TEST_LAPACK_HPP diff --git a/blas/unit_test/Test_Blas_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp similarity index 81% rename from blas/unit_test/Test_Blas_gesv.hpp rename to lapack/unit_test/Test_Lapack_gesv.hpp index 710102137e..06f51b7eb0 100644 --- a/blas/unit_test/Test_Blas_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -14,19 +14,20 @@ // //@HEADER -// only enable this test where KokkosBlas supports gesv: -// CUDA+MAGMA and HOST+BLAS -#if (defined(TEST_CUDA_BLAS_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ - (defined(TEST_OPENMP_BLAS_CPP) || defined(TEST_OPENMPTARGET_BLAS_CPP) || \ - defined(TEST_SERIAL_BLAS_CPP) || defined(TEST_THREADS_BLAS_CPP))) +// only enable this test where KokkosLapack supports gesv: +// CUDA+MAGMA and HOST+LAPACK +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || \ + defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ + defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include #include #include -#include +#include #include #include #include @@ -89,15 +90,15 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // Solve. 
try { - KokkosBlas::gesv(A, B, ipiv); + KokkosLapack::gesv(A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) // and no-tpl case bool nopivot_runtime_err = false; bool notpl_runtime_err = false; -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); @@ -105,8 +106,8 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { #else notpl_runtime_err = true; #endif -#else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL +#else // not have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL nopivot_runtime_err = (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -194,15 +195,15 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, // Solve. 
try { - KokkosBlas::gesv(A, B, ipiv); + KokkosLapack::gesv(A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) // and no-tpl case bool nopivot_runtime_err = false; bool notpl_runtime_err = false; -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // and have BLAS TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); @@ -210,8 +211,8 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, #else notpl_runtime_err = true; #endif -#else // not have MAGMA TPL -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // but have BLAS TPL +#else // not have MAGMA TPL +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL nopivot_runtime_err = (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); notpl_runtime_err = false; #else @@ -342,16 +343,16 @@ int test_gesv_mrhs(const char* mode) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_float"); - test_gesv("N"); // No pivoting - test_gesv("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_float"); + test_gesv("N"); // No pivoting + test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_float"); - test_gesv_mrhs("N"); // No pivoting - test_gesv_mrhs("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_float"); + test_gesv_mrhs("N"); // No pivoting + test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -360,16 +361,16 @@ TEST_F(TestCategory, 
gesv_mrhs_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_double"); - test_gesv("N"); // No pivoting - test_gesv("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_double"); + test_gesv("N"); // No pivoting + test_gesv("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_double"); - test_gesv_mrhs("N"); // No pivoting - test_gesv_mrhs("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_double"); + test_gesv_mrhs("N"); // No pivoting + test_gesv_mrhs("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -378,17 +379,16 @@ TEST_F(TestCategory, gesv_mrhs_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_complex_double"); - test_gesv, TestExecSpace>("N"); // No pivoting - test_gesv, TestExecSpace>("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_double"); + test_gesv, TestDevice>("N"); // No pivoting + test_gesv, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_complex_double"); - test_gesv_mrhs, TestExecSpace>("N"); // No pivoting - test_gesv_mrhs, TestExecSpace>( - "Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_double"); + test_gesv_mrhs, TestDevice>("N"); // No pivoting + test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif @@ -397,19 +397,18 @@ TEST_F(TestCategory, gesv_mrhs_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, gesv_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_complex_float"); - test_gesv, TestExecSpace>("N"); // No pivoting - test_gesv, TestExecSpace>("Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_complex_float"); + test_gesv, TestDevice>("N"); // No pivoting + test_gesv, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } TEST_F(TestCategory, gesv_mrhs_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gesv_mrhs_complex_float"); - test_gesv_mrhs, TestExecSpace>("N"); // No pivoting - test_gesv_mrhs, TestExecSpace>( - "Y"); // Partial pivoting + Kokkos::Profiling::pushRegion("KokkosLapack::Test::gesv_mrhs_complex_float"); + test_gesv_mrhs, TestDevice>("N"); // No pivoting + test_gesv_mrhs, TestDevice>("Y"); // Partial pivoting Kokkos::Profiling::popRegion(); } #endif -#endif // CUDA+MAGMA or BLAS+HOST +#endif // CUDA+MAGMA or LAPACK+HOST diff --git a/blas/unit_test/Test_Blas_trtri.hpp b/lapack/unit_test/Test_Lapack_trtri.hpp similarity index 88% rename from blas/unit_test/Test_Blas_trtri.hpp rename to lapack/unit_test/Test_Lapack_trtri.hpp index 0bebb9edf0..a19e575d89 100644 --- a/blas/unit_test/Test_Blas_trtri.hpp +++ b/lapack/unit_test/Test_Lapack_trtri.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -118,8 +118,8 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, // const int As0 = A.stride(0), As1 = A.stride(1); // const int Ae0 = A.extent(0), Ae1 = A.extent(1); - // printf("KokkosBlas::trtri test for %c %c, M %d, N %d, eps %g, ViewType: %s, - // A.stride(0): %d, A.stride(1): %d, A.extent(0): %d, A.extent(1): %d + // printf("KokkosLapack::trtri test for %c %c, M %d, N %d, eps %g, ViewType: + // %s, A.stride(0): %d, A.stride(1): %d, A.extent(0): %d, A.extent(1): %d // START\n", uplo[0],diag[0],M,N,eps,typeid(ViewTypeA).name(), As0, 
As1, Ae0, // Ae1); fflush(stdout); @@ -141,7 +141,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, host_A(bad_diag_idx - 1, bad_diag_idx - 1) = ScalarA(0); Kokkos::deep_copy(A, host_A); } - return KokkosBlas::trtri(uplo, diag, A); + return KokkosLapack::trtri(uplo, diag, A); } // If M is greater than 100 and A is an unit triangluar matrix, make A the @@ -158,13 +158,13 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, using functor_type = UnitDiagTRTRI; functor_type udtrtri(A); // Initialize As diag with 1s - Kokkos::parallel_for("KokkosBlas::Test::UnitDiagTRTRI", + Kokkos::parallel_for("KokkosLapack::Test::UnitDiagTRTRI", Kokkos::RangePolicy(0, M), udtrtri); } else { //(diag[0]=='N')||(diag[0]=='n') using functor_type = NonUnitDiagTRTRI; functor_type nudtrtri(A); // Initialize As diag with A(i,i)+10 - Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRTRI", + Kokkos::parallel_for("KokkosLapack::Test::NonUnitDiagTRTRI", Kokkos::RangePolicy(0, M), nudtrtri); } Kokkos::fence(); @@ -195,11 +195,11 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, #endif // A = A^-1 - ret = KokkosBlas::trtri(uplo, diag, A); + ret = KokkosLapack::trtri(uplo, diag, A); Kokkos::fence(); if (ret) { - printf("KokkosBlas::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], + printf("KokkosLapack::trtri(%c, %c, %s) returned %d\n", uplo[0], diag[0], typeid(ViewTypeA).name(), ret); return ret; } @@ -229,7 +229,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, vgemm.alpha = ScalarA(1); vgemm.beta = beta; Kokkos::parallel_for( - "KokkosBlas::Test::VanillaGEMM", + "KokkosLapack::Test::VanillaGEMM", Kokkos::TeamPolicy( M, Kokkos::AUTO, KokkosKernels::Impl::kk_get_max_vector_size()), @@ -362,11 +362,11 @@ int test_trtri(const char* mode) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_float) { - 
Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_float"); - test_trtri("UN"); - test_trtri("UU"); - test_trtri("LN"); - test_trtri("LU"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_float"); + test_trtri("UN"); + test_trtri("UU"); + test_trtri("LN"); + test_trtri("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -375,11 +375,11 @@ TEST_F(TestCategory, trtri_float) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_double"); - test_trtri("UN"); - test_trtri("UU"); - test_trtri("LN"); - test_trtri("LU"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_double"); + test_trtri("UN"); + test_trtri("UU"); + test_trtri("LN"); + test_trtri("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -388,11 +388,11 @@ TEST_F(TestCategory, trtri_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_complex_double"); - test_trtri, TestExecSpace>("UN"); - test_trtri, TestExecSpace>("UU"); - test_trtri, TestExecSpace>("LN"); - test_trtri, TestExecSpace>("LU"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_double"); + test_trtri, TestDevice>("UN"); + test_trtri, TestDevice>("UU"); + test_trtri, TestDevice>("LN"); + test_trtri, TestDevice>("LU"); Kokkos::Profiling::popRegion(); } #endif @@ -401,11 +401,11 @@ TEST_F(TestCategory, trtri_complex_double) { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, trtri_complex_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::trtri_complex_float"); - test_trtri, TestExecSpace>("UN"); - test_trtri, TestExecSpace>("UU"); - test_trtri, TestExecSpace>("LN"); - test_trtri, TestExecSpace>("LU"); + Kokkos::Profiling::pushRegion("KokkosLapack::Test::trtri_complex_float"); + 
test_trtri, TestDevice>("UN"); + test_trtri, TestDevice>("UU"); + test_trtri, TestDevice>("LN"); + test_trtri, TestDevice>("LU"); Kokkos::Profiling::popRegion(); } #endif diff --git a/lapack/unit_test/backends/Test_Cuda_Lapack.cpp b/lapack/unit_test/backends/Test_Cuda_Lapack.cpp new file mode 100644 index 0000000000..d75988ef81 --- /dev/null +++ b/lapack/unit_test/backends/Test_Cuda_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_CUDA_LAPACK_CPP +#define TEST_CUDA_LAPACK_CPP + +#include +#include + +#endif // TEST_CUDA_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_HIP_Lapack.cpp b/lapack/unit_test/backends/Test_HIP_Lapack.cpp new file mode 100644 index 0000000000..c0ec152233 --- /dev/null +++ b/lapack/unit_test/backends/Test_HIP_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_HIP_LAPACK_CPP +#define TEST_HIP_LAPACK_CPP + +#include "Test_HIP.hpp" +#include "Test_Lapack.hpp" + +#endif // TEST_HIP_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp b/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp new file mode 100644 index 0000000000..5191918ce9 --- /dev/null +++ b/lapack/unit_test/backends/Test_OpenMPTarget_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMPTARGET_LAPACK_CPP +#define TEST_OPENMPTARGET_LAPACK_CPP + +#include "Test_OpenMPTarget.hpp" +#include "Test_Lapack.hpp" + +#endif // TEST_OPENMPTARGET_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp b/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp new file mode 100644 index 0000000000..533580fd23 --- /dev/null +++ b/lapack/unit_test/backends/Test_OpenMP_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMP_LAPACK_CPP +#define TEST_OPENMP_LAPACK_CPP + +#include +#include + +#endif // TEST_OPENMP_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_SYCL_Lapack.cpp b/lapack/unit_test/backends/Test_SYCL_Lapack.cpp new file mode 100644 index 0000000000..9485f2a420 --- /dev/null +++ b/lapack/unit_test/backends/Test_SYCL_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SYCL_LAPACK_CPP +#define TEST_SYCL_LAPACK_CPP + +#include +#include + +#endif // TEST_SYCL_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_Serial_Lapack.cpp b/lapack/unit_test/backends/Test_Serial_Lapack.cpp new file mode 100644 index 0000000000..d0324b9642 --- /dev/null +++ b/lapack/unit_test/backends/Test_Serial_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SERIAL_LAPACK_CPP +#define TEST_SERIAL_LAPACK_CPP + +#include +#include + +#endif // TEST_SERIAL_LAPACK_CPP diff --git a/lapack/unit_test/backends/Test_Threads_Lapack.cpp b/lapack/unit_test/backends/Test_Threads_Lapack.cpp new file mode 100644 index 0000000000..aa1acbcf6c --- /dev/null +++ b/lapack/unit_test/backends/Test_Threads_Lapack.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_THREADS_LAPACK_CPP +#define TEST_THREADS_LAPACK_CPP + +#include +#include + +#endif // TEST_THREADS_LAPACK_CPP diff --git a/master_history.txt b/master_history.txt index e7ed75b7f0..2ece96fed6 100644 --- a/master_history.txt +++ b/master_history.txt @@ -22,3 +22,4 @@ tag: 3.7.01 date: 12/01/2022 master: 04821ac3 release: 6cb632b6 tag: 4.0.00 date: 02/23/2023 master: b4014bf2 release: a10dff20 tag: 4.0.01 date: 04/26/2023 master: b9c1bab7 release: 8809e41c tag: 4.1.00 date: 06/20/2023 master: 1331baf1 release: 14ad220a +tag: 4.2.00 date: 11/09/2023 master: 25a31f88 release: 912d3778 diff --git a/ode/CMakeLists.txt b/ode/CMakeLists.txt index 9d92dc07ba..b9cf089445 100644 --- a/ode/CMakeLists.txt +++ b/ode/CMakeLists.txt @@ -11,5 +11,8 @@ ENDIF() # Adding unit-tests -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode) +# Note BMK: Since ODE has no auto-generated ETI files, this directory does not exist in a build without unit tests. 
+# This causes configure errors when building an app against a Trilinos install, and the unit test build dir doesn't contain any headers that need to be found. +# But uncomment the next line if ETI headers are ever added. +# KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode) diff --git a/ode/impl/KokkosODE_Newton_impl.hpp b/ode/impl/KokkosODE_Newton_impl.hpp new file mode 100644 index 0000000000..d5000a74ab --- /dev/null +++ b/ode/impl/KokkosODE_Newton_impl.hpp @@ -0,0 +1,95 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSODE_NEWTON_IMPL_HPP +#define KOKKOSODE_NEWTON_IMPL_HPP + +#include "Kokkos_Core.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_LU_Serial_Impl.hpp" +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBlas1_nrm2.hpp" +#include "KokkosBlas1_scal.hpp" +#include "KokkosBlas1_axpby.hpp" + +#include "KokkosODE_Types.hpp" + +namespace KokkosODE { +namespace Impl { + +template +KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( + system_type& sys, const KokkosODE::Experimental::Newton_params& params, + mat_type& J, mat_type& tmp, vec_type& y0, vec_type& rhs, vec_type& update) { + using newton_solver_status = KokkosODE::Experimental::newton_solver_status; + using value_type = typename vec_type::non_const_value_type; + + // Define the type returned by nrm2 to store + // the norm of the residual. + using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename vec_type::non_const_value_type>::mag_type; + norm_type norm = Kokkos::ArithTraits::zero(); + + // LBV - 07/24/2023: for now assume that we take + // a full Newton step. Eventually this value can + // be computed using a line search algorithm to + // improve convergence for difficult problems. + const value_type alpha = Kokkos::ArithTraits::one(); + + // Iterate until maxIts or the tolerance is reached + for (int it = 0; it < params.max_iters; ++it) { // handle.maxIters; ++it) { + // compute initial rhs + sys.residual(y0, rhs); + + // Solve the following linearized + // problem at each iteration: J*update=-rhs + // with J=du/dx, rhs=f(u_n+update)-f(u_n) + norm = KokkosBlas::serial_nrm2(rhs); + + if ((norm < params.rel_tol) || + (it > 0 ? 
KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { + return newton_solver_status::NLS_SUCCESS; + } + + // compute LHS + sys.jacobian(y0, J); + + // solve linear problem + int linSolverStat = + KokkosBatched::SerialGesv::invoke( + J, update, rhs, tmp); + KokkosBlas::SerialScale::invoke(-1, update); + + if (linSolverStat == 1) { +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Linear solve gesv returned failure! \n"); +#else + Kokkos::printf("NewtonFunctor: Linear solve gesv returned failure! \n"); +#endif + return newton_solver_status::LIN_SOLVE_FAIL; + } + + // update solution // x = x + alpha*update + KokkosBlas::serial_axpy(alpha, update, y0); + } + return newton_solver_status::MAX_ITER; +} + +} // namespace Impl +} // namespace KokkosODE + +#endif // KOKKOSODE_NEWTON_IMPL_HPP diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp index 791093c8db..f5fe39d65d 100644 --- a/ode/impl/KokkosODE_RungeKutta_impl.hpp +++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp @@ -48,7 +48,7 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, // now accumulate y_new += dt*b_i*k_i { // we always start with y_new += dt*b_0*k0 - auto k0 = Kokkos::subview(k_vecs, Kokkos::ALL, 0); + auto k0 = Kokkos::subview(k_vecs, 0, Kokkos::ALL); ode.evaluate_function(t + table.c[0] * dt, dt, y_old, k0); for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { y_new(eqIdx) += dt * table.b[0] * k0(eqIdx); @@ -65,12 +65,12 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, for (int idx = 0; idx < stageIdx; ++idx) { for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { temp(eqIdx) += - table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(eqIdx, idx); + table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(idx, eqIdx); } } KokkosBlas::SerialScale::invoke(dt, temp); KokkosBlas::serial_axpy(1, y_old, temp); - auto k = Kokkos::subview(k_vecs, Kokkos::ALL, stageIdx); + auto k = Kokkos::subview(k_vecs, stageIdx, 
Kokkos::ALL); ode.evaluate_function(t + table.c[stageIdx] * dt, dt, temp, k); for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { y_new(eqIdx) += dt * table.b[stageIdx] * k(eqIdx); @@ -82,7 +82,7 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { temp(eqIdx) = 0; for (int stageIdx = 0; stageIdx < nstages; ++stageIdx) { - temp(eqIdx) += dt * table.e[stageIdx] * k_vecs(eqIdx, stageIdx); + temp(eqIdx) += dt * table.e[stageIdx] * k_vecs(stageIdx, eqIdx); } } } @@ -97,20 +97,38 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { constexpr scalar_type error_threshold = 1; bool adapt = params.adaptivity; + bool dt_was_reduced; if (std::is_same_v>) { adapt = false; } - scalar_type dt = (t_end - t_start) / params.max_steps; - scalar_type t = t_start; - for (int stepIdx = 0; (stepIdx < params.max_steps) && (t < t_end); + // Set current time and initial time step + scalar_type t_now = t_start; + scalar_type dt = (t_end - t_start) / params.max_steps; + + // Loop over time steps to integrate ODE + for (int stepIdx = 0; (stepIdx < params.max_steps) && (t_now <= t_end); ++stepIdx) { - // Set err to be arbitrarily larger than our threshold of 1 + // Check that the step attempted is not putting + // the solution past t_end, otherwise shrink dt + if (t_end < t_now + dt) { + dt = t_end - t_now; + } + + // Set error to be arbitrarily larger than our threshold + // so we can pass the initial check. Also reset + // dt_was_reduced to false for current time step. scalar_type error = 2 * error_threshold; scalar_type tol = 0; + dt_was_reduced = false; + + // Take tentative steps until the requested error + // is met. 
This of course only works for adaptive + // solvers, for fixed time steps we simply do not + // compute and check the error of the current step while (error_threshold < error) { // Take a step of Runge-Kutta integrator - RKStep(ode, table, adapt, t, dt, y0, y, temp, k_vecs); + RKStep(ode, table, adapt, t_now, dt, y0, y, temp, k_vecs); // Compute the largest error and decide on // the size of the next time step to take. @@ -131,44 +149,34 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( // is rejected. if (error > 1) { dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); + dt_was_reduced = true; } + if (dt < params.min_step_size) return Experimental::ode_solver_status::MIN_SIZE; } } - // Update y0 to stage the next time step. + // Update time and initial condition for next time step + t_now += dt; for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y0(eqIdx) = y(eqIdx); } - if (t < t_end) { - // We may want to print the evolution of the solution over time - // with something similar to the statement below but will need - // to generalize it and make it GPU friendly first, also it - // should be guarded when not doing a debug run, this prints - // a lot...
- // std::cout << " step " << stepIdx << " t=" << t << ", y={"; - // for(int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { - // std::cout << y(eqIdx) << " "; - // } - // std::cout << "}" << std::endl; - if (adapt) { + if (t_now < t_end) { + if (adapt && !dt_was_reduced && error < 0.5) { // Compute new time increment dt = dt * Kokkos::min( 10.0, Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); - } else { - // Use same increment - t += dt; } } else { return Experimental::ode_solver_status::SUCCESS; } } - if (t < t_end) return Experimental::ode_solver_status::MAX_STEP; + if (t_now < t_end) return Experimental::ode_solver_status::MAX_STEP; return Experimental::ode_solver_status::SUCCESS; } // RKSolve diff --git a/ode/src/KokkosODE_Newton.hpp b/ode/src/KokkosODE_Newton.hpp new file mode 100644 index 0000000000..94c96e2eea --- /dev/null +++ b/ode/src/KokkosODE_Newton.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSODE_NEWTON_HPP +#define KOKKOSODE_NEWTON_HPP + +/// \author Luc Berger-Vergiat (lberge@sandia.gov) +/// \file KokkosODE_Newton.hpp + +#include "Kokkos_Core.hpp" + +#include "KokkosODE_Types.hpp" +#include "KokkosODE_Newton_impl.hpp" + +namespace KokkosODE { +namespace Experimental { + +/// \brief Newton solver for non-linear system of equations +struct Newton { + template + KOKKOS_FUNCTION static newton_solver_status Solve( + const system_type& sys, const Newton_params& params, const mat_type& J, + const mat_type& tmp, const vec_type& y0, const vec_type& rhs, + const vec_type& update) { + return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update); + } +}; + +} // namespace Experimental +} // namespace KokkosODE + +#endif // KOKKOSODE_NEWTON_HPP diff --git a/ode/src/KokkosODE_RungeKutta.hpp b/ode/src/KokkosODE_RungeKutta.hpp index c41d79c1ef..b4711de81c 100644 --- a/ode/src/KokkosODE_RungeKutta.hpp +++ b/ode/src/KokkosODE_RungeKutta.hpp @@ -17,7 +17,7 @@ #ifndef KOKKOSODE_RUNGEKUTTA_HPP #define KOKKOSODE_RUNGEKUTTA_HPP -/// \author Luc Berger-Vergiat (lberg@sandia.gov) +/// \author Luc Berger-Vergiat (lberge@sandia.gov) /// \file KokkosODE_RungeKutta.hpp #include "Kokkos_Core.hpp" diff --git a/ode/src/KokkosODE_Types.hpp b/ode/src/KokkosODE_Types.hpp index 136ff75536..7d78227526 100644 --- a/ode/src/KokkosODE_Types.hpp +++ b/ode/src/KokkosODE_Types.hpp @@ -51,6 +51,25 @@ struct ODE_params { min_step_size(min_step_size_) {} }; +enum newton_solver_status : int { + NLS_SUCCESS = 0, + MAX_ITER = 1, + LIN_SOLVE_FAIL = 2 +}; + +struct Newton_params { + int max_iters; + double abs_tol, rel_tol; + + // Constructor that sets the maximum number of iterations
+ // and the absolute and relative tolerances stored in this + // struct (max_iters, abs_tol and rel_tol). + KOKKOS_FUNCTION + Newton_params(const int max_iters_, const double abs_tol_, + const double rel_tol_) + : max_iters(max_iters_), abs_tol(abs_tol_), rel_tol(rel_tol_) {} +}; + } // namespace Experimental } // namespace KokkosODE #endif // KOKKOSODE_TYPES_HPP diff --git a/ode/unit_test/Test_ODE.hpp b/ode/unit_test/Test_ODE.hpp index dd929c48fc..5d4861879b 100644 --- a/ode/unit_test/Test_ODE.hpp +++ b/ode/unit_test/Test_ODE.hpp @@ -16,7 +16,11 @@ #ifndef TEST_ODE_HPP #define TEST_ODE_HPP +// Explicit integrators #include "Test_ODE_RK.hpp" #include "Test_ODE_RK_chem.hpp" +// Implicit integrators +#include "Test_ODE_Newton.hpp" + #endif // TEST_ODE_HPP diff --git a/ode/unit_test/Test_ODE_Newton.hpp b/ode/unit_test/Test_ODE_Newton.hpp new file mode 100644 index 0000000000..d235df1e56 --- /dev/null +++ b/ode/unit_test/Test_ODE_Newton.hpp @@ -0,0 +1,550 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_Newton.hpp" + +namespace Test { + +template +struct NewtonSolve_wrapper { + using newton_params = KokkosODE::Experimental::Newton_params; + + system_type my_nls; + newton_params params; + + vec_type x, rhs, update; + mat_type J, tmp; + status_view status; + + NewtonSolve_wrapper(const system_type& my_nls_, const newton_params& params_, + const vec_type& x_, const vec_type& rhs_, + const vec_type& update_, const mat_type& J_, + const mat_type& tmp_, const status_view& status_) + : my_nls(my_nls_), + params(params_), + x(x_), + rhs(rhs_), + update(update_), + J(J_), + tmp(tmp_), + status(status_) {} + + KOKKOS_FUNCTION + void operator()(const int idx) const { + // Take subviews to create the local problem + auto local_x = Kokkos::subview( + x, Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1)))); + auto local_rhs = Kokkos::subview( + rhs, Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1)))); + auto local_update = Kokkos::subview( + update, + Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1)))); + auto local_J = Kokkos::subview( + J, + Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1))), + Kokkos::ALL()); + auto local_tmp = Kokkos::subview( + tmp, + Kokkos::pair(static_cast(my_nls.neqs * idx), + static_cast(my_nls.neqs * (idx + 1))), + Kokkos::ALL()); + + // Run Newton nonlinear solver + status(idx) = KokkosODE::Experimental::Newton::Solve( + my_nls, params, local_J, local_tmp, local_x, local_rhs, local_update); + } +}; + +template +void run_newton_test(const system_type& mySys, + KokkosODE::Experimental::Newton_params& params, + const scalar_type* const initial_val, + const scalar_type* const solution) { + using execution_space = typename Device::execution_space; + using 
newton_solver_status = KokkosODE::Experimental::newton_solver_status; + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; + + Kokkos::View status("Newton status", 1); + + vec_type x("solution vector", mySys.neqs), + rhs("right hand side vector", mySys.neqs); + auto x_h = Kokkos::create_mirror_view(x); + auto r_h = Kokkos::create_mirror_view(rhs); + + vec_type update("update", mySys.neqs); + mat_type J("jacobian", mySys.neqs, mySys.neqs), + tmp("temp mem", mySys.neqs, mySys.neqs + 4); + + // Initial values + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + x_h(eqIdx) = initial_val[eqIdx]; + } + Kokkos::deep_copy(x, x_h); + + Kokkos::RangePolicy my_policy(0, 1); + NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, + status); + + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto status_h = Kokkos::create_mirror_view(status); + Kokkos::deep_copy(status_h, status); + EXPECT_TRUE(status_h(0) == newton_solver_status::NLS_SUCCESS); + + Kokkos::deep_copy(x_h, x); + Kokkos::deep_copy(r_h, rhs); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Non-linear problem solution and residual:" << std::endl; + std::cout << " [("; + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + std::cout << " " << x_h(eqIdx); + } + std::cout << " ), " << KokkosBlas::serial_nrm2(rhs) << ", ("; + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + std::cout << " " + << Kokkos::abs(x_h(eqIdx) - solution[eqIdx]) / + Kokkos::abs(solution[eqIdx]); + } + std::cout << " )]" << std::endl; +#else + (void)solution; +#endif +} + +// Quadratic equation +// x^2 - x - 2 = 0 +// Solution: x = 2 or x = -1 +// Derivative 2*x - 1 +template +struct QuadraticEquation { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 1; + + QuadraticEquation() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = y(0) * y(0) - y(0) - 2; + } + + KOKKOS_FUNCTION void jacobian(const 
vec_type& y, const mat_type& jac) const { + jac(0, 0) = 2 * y(0) - 1; + } +}; + +// Trigonometric equation +// f(x) = cos(x) - x = 0 +// Solution: 0.739085 +// f'(x) = -sin(x) - 1 +template +struct TrigonometricEquation { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 1; + + TrigonometricEquation() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = Kokkos::cos(y(0)) - y(0); + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = -Kokkos::sin(y(0)) - 1; + } +}; + +// Logarithmic equation +// f(x) = 7x - log(7x) - 1 = 0 +// Solution: 1/7 = 0.14285714285 +// f'(x) = 7 - (1 / x) +template +struct LogarithmicEquation { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 1; + + LogarithmicEquation() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = 7 * y(0) - Kokkos::log(7 * y(0)) - 1; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 7 - 1 / y(0); + } +}; + +template +void test_newton_status() { + using execution_space = typename Device::execution_space; + using newton_solver_status = KokkosODE::Experimental::newton_solver_status; + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; + + double abs_tol, rel_tol; + if (std::is_same_v) { + rel_tol = 10e-5; + abs_tol = 10e-7; + } else if (std::is_same_v) { + rel_tol = 10e-8; + abs_tol = 10e-15; + } else { + throw std::runtime_error("scalar_type is neither float, nor double!"); + } + KokkosODE::Experimental::Newton_params params(50, abs_tol, rel_tol); + Kokkos::View status("newton solver status", 1); + auto status_h = Kokkos::create_mirror_view(status); + + // Create the non-linear system and initialize data + QuadraticEquation my_system{}; + + scalar_type initial_value[3] = {1.0, -0.5, 0.5}; +#ifdef HAVE_KOKKOSKERNELS_DEBUG 
+ scalar_type solution[3] = {2.0, -1.0, 0.0}; +#endif + newton_solver_status newton_status[3] = { + newton_solver_status::NLS_SUCCESS, newton_solver_status::MAX_ITER, + newton_solver_status::LIN_SOLVE_FAIL}; + vec_type x("solution vector", 1), rhs("right hand side vector", 1); + auto x_h = Kokkos::create_mirror_view(x); + auto r_h = Kokkos::create_mirror_view(rhs); + + vec_type update("update", 1); + mat_type J("jacobian", 1, 1), tmp("temp mem", 1, 5); + + for (int idx = 0; idx < 3; ++idx) { + params.max_iters = (idx == 1) ? 2 : 50; + Kokkos::deep_copy(x, initial_value[idx]); + + Kokkos::RangePolicy my_policy(0, 1); + NewtonSolve_wrapper solve_wrapper(my_system, params, x, rhs, update, J, tmp, + status); + Kokkos::parallel_for(my_policy, solve_wrapper); + + Kokkos::deep_copy(status_h, status); + EXPECT_TRUE(status_h(0) == newton_status[idx]); + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + Kokkos::deep_copy(x_h, x); + Kokkos::deep_copy(r_h, rhs); + printf("Non-linear problem solution and residual with initial value %f:\n", + initial_value[idx]); + printf(" [%f, %g, %g]\n", x_h(0), r_h(0), + Kokkos::abs(x_h(0) - solution[idx]) / Kokkos::abs(solution[idx])); +#endif + } +} + +template +void test_simple_problems() { + double abs_tol, rel_tol; + if (std::is_same_v) { + rel_tol = 10e-5; + abs_tol = 10e-7; + } else if (std::is_same_v) { + rel_tol = 10e-8; + abs_tol = 10e-15; + } else { + throw std::runtime_error("scalar_type is neither float, nor double!"); + } + KokkosODE::Experimental::Newton_params params(50, abs_tol, rel_tol); + + { + // Test the Newton solver on a quadratic equation + // with two different initial guesses that lead to + // the two solutions of the equation.
+#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Quadratic Equation problem" << std::endl; +#endif + using system_type = QuadraticEquation; + system_type mySys{}; + scalar_type initial_value[2] = {1.0, -0.5}, solution[2] = {2.0, -1.0}; + for (int idx = 0; idx < 2; ++idx) { + run_newton_test( + mySys, params, &(initial_value[idx]), &(solution[idx])); + } +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Quadratic Equation problem" << std::endl; +#endif + } + + { + // Test the Newton solver on a trigonometric equation +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Trigonometric Equation problem" << std::endl; +#endif + using system_type = TrigonometricEquation; + system_type mySys{}; + scalar_type initial_value[1] = {0.1}, solution[1] = {0.739085}; + run_newton_test(mySys, params, + initial_value, solution); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Trigonometric Equation problem" << std::endl; +#endif + } + + { + // Test the Newton solver on a logarithmic equation +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Logarithmic Equation problem" << std::endl; +#endif + using system_type = LogarithmicEquation; + system_type mySys{}; + scalar_type initial_value[1] = {static_cast(0.5)}, + solution[1] = {static_cast(1.0) / + static_cast(7.0)}; + run_newton_test(mySys, params, + initial_value, solution); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Logarithmic Equation problem" << std::endl; +#endif + } +} + +/////////////////////////////////////// +// Now solving systems of equations // +// To make things more realistic and // +// interesting. 
// +/////////////////////////////////////// + +// Intersections of two circles +// Equations: f0 = (x-0)**2 + (y-0)**2 - 4.00 = 0 +// f1 = (x-3)**2 + (y-0)**2 - 2.25 = 0 +// +// Jacobian: J00 = 2*x J01 = 2*y +// J10 = 2*(x-3) J11 = 2*y +// +// Solution: x = 10.75/6 y = +/- sqrt(2.25 + 7.25/6) +// ~ 1.7916666 ~ +/- 0.8887803753 +template +struct CirclesIntersections { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 2; + + CirclesIntersections() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = y(0) * y(0) + y(1) * y(1) - 4; + f(1) = (y(0) - 3) * (y(0) - 3) + y(1) * y(1) - 2.25; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 2 * y(0); + jac(0, 1) = 2 * y(1); + jac(1, 0) = 2 * (y(0) - 3); + jac(1, 1) = 2 * y(1); + } +}; + +// Intersections of a circle and an hyperbola +// Equations: f0 = x**2 + y**2 - 4.00 = 0 +// f1 = x*y - 1 = 0 --> also y = 1 / x +// +// Jacobian: J00 = 2*x J01 = 2*y +// J10 = y J11 = x +// +// Solution: x = +/- sqrt( (4 +/- sqrt(12)) / 2); y = 1 / x +// x0~ 1.9318516525 y0~ 0.5176380902 +// x1~ 0.5176380902 y1~ 1.9318516525 +// x2~ -0.5176380902 y2~ -1.9318516525 +// x3~ -1.9318516525 y3~ -0.5176380902 +template +struct CircleHyperbolaIntersection { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + static constexpr int neqs = 2; + + CircleHyperbolaIntersection() {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& f) const { + f(0) = y(0) * y(0) + y(1) * y(1) - 4; + f(1) = y(0) * y(1) - 1; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 2 * y(0); + jac(0, 1) = 2 * y(1); + jac(1, 0) = y(1); + jac(1, 1) = y(0); + } +}; + +template +void test_simple_systems() { + double abs_tol, rel_tol; + if (std::is_same_v) { + rel_tol = 10e-5; + abs_tol = 10e-7; + } else if (std::is_same_v) { + rel_tol = 10e-8; + abs_tol = 10e-15; + 
} else { + throw std::runtime_error("scalar_type is neither float, nor double!"); + } + KokkosODE::Experimental::Newton_params params(50, abs_tol, rel_tol); + + { + // First problem: intersection of two circles +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Circles Intersetcion problem" << std::endl; +#endif + using system_type = CirclesIntersections; + system_type mySys{}; + scalar_type initial_values[2] = {1.5, 1.5}; + scalar_type solution[2] = {10.75 / 6, 0.8887803753}; + run_newton_test(mySys, params, + initial_values, solution); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Circles Intersetcion problem" << std::endl; +#endif + } + + { + // Second problem: circle / hyperbola intersection +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "\nStarting Circle/Hyperbola Intersetcion problem" + << std::endl; +#endif + using system_type = CircleHyperbolaIntersection; + system_type mySys{}; + + scalar_type init_vals[2] = {0.0, 1.0}; + scalar_type solutions[2] = { + Kokkos::ArithTraits::one() / + Kokkos::sqrt(static_cast( + 4 + Kokkos::sqrt(static_cast(12.0)) / 2)), + Kokkos::sqrt(static_cast( + (4 + Kokkos::sqrt(static_cast(12.0))) / 2))}; + run_newton_test(mySys, params, init_vals, + solutions); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished Circle/Hyperbola Intersetcion problem" << std::endl; +#endif + } +} + +//////////////////////////////////////////// +// Finally, solving systems of equations // +// within a parallel_for loop as it would // +// happen within a FE/FD code. 
// +//////////////////////////////////////////// + +template +void test_newton_on_device() { + using execution_space = typename Device::execution_space; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + using newton_params = KokkosODE::Experimental::Newton_params; + using system_type = CircleHyperbolaIntersection; + using newton_solver_status = KokkosODE::Experimental::newton_solver_status; + + double abs_tol, rel_tol; + if (std::is_same_v) { + rel_tol = 10e-5; + abs_tol = 10e-7; + } else if (std::is_same_v) { + rel_tol = 10e-8; + abs_tol = 10e-15; + } else { + throw std::runtime_error("scalar_type is neither float, nor double!"); + } + + constexpr int num_systems = 1000; + const newton_params params(50, abs_tol, rel_tol); + + system_type mySys{}; + + vec_type x("solution vector", mySys.neqs * num_systems); + vec_type rhs("right hand side vector", mySys.neqs * num_systems); + vec_type update("update", mySys.neqs * num_systems); + mat_type J("jacobian", mySys.neqs * num_systems, mySys.neqs); + mat_type tmp("temp mem", mySys.neqs * num_systems, mySys.neqs + 4); + + Kokkos::View status("solver status", + num_systems); + + auto x_h = Kokkos::create_mirror_view(x); + auto r_h = Kokkos::create_mirror_view(rhs); + + // Initial values + scalar_type initial_val[2] = {0.0, 1.0}; + for (int sysIdx = 0; sysIdx < num_systems; ++sysIdx) { + x_h(2 * sysIdx) = initial_val[0]; + x_h(2 * sysIdx + 1) = initial_val[1]; + } + Kokkos::deep_copy(x, x_h); + + Kokkos::RangePolicy my_policy(0, num_systems); + NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, + status); + + Kokkos::parallel_for(my_policy, solve_wrapper); + Kokkos::fence(); + + auto status_h = Kokkos::create_mirror_view(status); + Kokkos::deep_copy(status_h, status); + Kokkos::deep_copy(x_h, x); + for (int sysIdx = 0; sysIdx < num_systems; ++sysIdx) { + EXPECT_TRUE(status_h(sysIdx) == newton_solver_status::NLS_SUCCESS) + << "System " << sysIdx << " did not report a successful 
convergence!"; + } +} + +} // namespace Test + +// No ETI is performed for these device routines +// Just pick scalar types at will... +TEST_F(TestCategory, Newton_status_float) { + ::Test::test_newton_status(); +} +TEST_F(TestCategory, Newton_status_double) { + ::Test::test_newton_status(); +} + +TEST_F(TestCategory, Newton_simple_float) { + ::Test::test_simple_problems(); +} +TEST_F(TestCategory, Newton_simple_double) { + ::Test::test_simple_problems(); +} + +TEST_F(TestCategory, Newton_system_float) { + ::Test::test_simple_systems(); +} +TEST_F(TestCategory, Newton_system_double) { + ::Test::test_simple_systems(); +} + +TEST_F(TestCategory, Newton_parallel_float) { + ::Test::test_newton_on_device(); +} +TEST_F(TestCategory, Newton_parallel_double) { + ::Test::test_newton_on_device(); +} diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index 1e851108f3..c7d1a84865 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -130,7 +130,7 @@ void test_method(const std::string label, ode_type& my_ode, KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", my_ode.neqs); - mv_type kstack("k stack", my_ode.neqs, solver_type::num_stages()); + mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); Kokkos::RangePolicy my_policy(0, 1); RKSolve_wrapper @@ -152,11 +152,11 @@ void test_method(const std::string label, ode_type& my_ode, (void)label; #endif for (int stageIdx = 0; stageIdx < solver_type::num_stages(); ++stageIdx) { - EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(0, stageIdx), 1e-8); - EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(1, stageIdx), 1e-8); + EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(stageIdx, 0), 1e-8); + EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(stageIdx, 1), 1e-8); #if defined(HAVE_KOKKOSKERNELS_DEBUG) - std::cout << " k" << stageIdx << "={" << kstack_h(0, stageIdx) << ", " - << kstack_h(1, stageIdx) << "}" << std::endl; + std::cout << " k" << stageIdx << "={" << 
kstack_h(stageIdx, 0) << ", " + << kstack_h(stageIdx, 1) << "}" << std::endl; #endif } EXPECT_NEAR_KK(sol(0), y_new_h(0), 1e-8); @@ -174,11 +174,12 @@ void test_method(const std::string label, ode_type& my_ode, } // test_method -template +template void test_RK() { - using RK_type = KokkosODE::Experimental::RK_type; - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; + using execution_space = typename Device::execution_space; + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; duho my_oscillator(1, 1, 4); const int neqs = my_oscillator.neqs; @@ -322,7 +323,7 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, using solver_type = KokkosODE::Experimental::RungeKutta; vec_type tmp("tmp vector", my_ode.neqs); - mv_type kstack("k stack", my_ode.neqs, solver_type::num_stages()); + mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); vec_type y_new("solution", my_ode.neqs); vec_type y_old("intial conditions", my_ode.neqs); @@ -349,11 +350,12 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, } // test_method -template +template void test_convergence_rate() { - using RK_type = KokkosODE::Experimental::RK_type; - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; + using execution_space = typename Device::execution_space; + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; duho my_oscillator(1, 1, 4); const int neqs = my_oscillator.neqs; @@ -463,19 +465,110 @@ void test_convergence_rate() { } } // test_convergence_rate +template +void test_adaptivity() { + using execution_space = typename Device::execution_space; + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + + duho my_oscillator(1, 1, 4); + const int neqs = my_oscillator.neqs; + + vec_type y("solution", neqs), f("function", neqs); + auto y_h = 
Kokkos::create_mirror(y); + y_h(0) = 1; + y_h(1) = 0; + Kokkos::deep_copy(y, y_h); + + constexpr double tstart = 0, tend = 1.024; + constexpr int maxSteps = 512, numSteps = 128; + constexpr double absTol = 1e-14, relTol = 1e-8, minStepSize = 0.001; + vec_type y_new("y new", neqs), y_old("y old", neqs); + + // Since y_old_h will be reused to set initial conditions + // for each method tested we do not want to use + // create_mirror_view which would not do a copy + // when y_old is in HostSpace. + typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 1; + y_old_h(1) = 0; + + // First compute analytical solution as reference + // and to evaluate the error from each RK method. + vec_type y_ref("reference value", neqs); + auto y_ref_h = Kokkos::create_mirror(y_ref); + { + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::RangePolicy my_policy(0, 1); + solution_wrapper wrapper(my_oscillator, tend, y_old, y_ref); + Kokkos::parallel_for(my_policy, wrapper); + + Kokkos::deep_copy(y_ref_h, y_ref); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nAnalytical solution" << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" + << std::endl; +#endif + } + + vec_type tmp("tmp vector", neqs); + mv_type kstack( + "k stack", + KokkosODE::Experimental::RungeKutta::num_stages(), neqs); + + Kokkos::RangePolicy my_policy(0, 1); + KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, + minStepSize); + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + RKSolve_wrapper + solve_wrapper(my_oscillator, params, tstart, tend, y_old, y_new, tmp, + kstack); + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "Results: " << std::endl; + std::cout << " y_ref={ "; + for (int idx = 0; idx < y_ref_h.extent_int(0); ++idx) { + std::cout << y_ref_h(idx) << " "; + } + 
std::cout << "}" << std::endl; + std::cout << " y_new={ "; + for (int idx = 0; idx < y_new_h.extent_int(0); ++idx) { + std::cout << y_new_h(idx) << " "; + } + std::cout << "}" << std::endl; + std::cout << " error={ "; + double error; +#endif + + for (int idx = 0; idx < y_new_h.extent_int(0); ++idx) { +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + error = + Kokkos::abs(y_new_h(idx) - y_ref_h(idx)) / Kokkos::abs(y_ref_h(idx)); + std::cout << error << " "; +#endif + EXPECT_NEAR_KK_REL(y_new_h(idx), y_ref_h(idx), 1e-7); + } +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "}" << std::endl; +#endif + +} // test_adaptivity + } // namespace Test -int test_RK() { - Test::test_RK(); - return 1; -} +void test_RK() { Test::test_RK(); } + +void test_RK_conv_rate() { Test::test_convergence_rate(); } -int test_RK_conv_rate() { - Test::test_convergence_rate(); - return 1; -} +void test_RK_adaptivity() { Test::test_adaptivity(); } #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, RKSolve_serial) { test_RK(); } TEST_F(TestCategory, RK_conv_rate) { test_RK_conv_rate(); } +TEST_F(TestCategory, RK_adaptivity) { test_RK_adaptivity(); } #endif diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index 2adc202ddc..763f38a013 100644 --- a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -89,12 +89,13 @@ struct chem_model_2 { } }; -template +template void test_chem() { - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; - using RK_type = KokkosODE::Experimental::RK_type; - using solver_type = KokkosODE::Experimental::RungeKutta; + using execution_space = typename Device::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using RK_type = KokkosODE::Experimental::RK_type; + using solver_type = KokkosODE::Experimental::RungeKutta; { chem_model_1 chem_model; @@ -103,7 +104,7 @@ void test_chem() { KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", 
neqs); - mv_type kstack("k stack", neqs, solver_type::num_stages()); + mv_type kstack("k stack", solver_type::num_stages(), neqs); // Set initial conditions vec_type y_new("solution", neqs); @@ -144,7 +145,7 @@ void test_chem() { KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); - mv_type kstack("k stack", neqs, solver_type::num_stages()); + mv_type kstack("k stack", solver_type::num_stages(), neqs); // Set initial conditions vec_type y_new("solution", neqs); @@ -188,7 +189,7 @@ void test_chem() { } // namespace Test int test_chem_models() { - Test::test_chem(); + Test::test_chem(); return 1; } diff --git a/perf_test/Benchmark_Utils.hpp b/perf_test/Benchmark_Utils.hpp new file mode 100644 index 0000000000..8f34182f41 --- /dev/null +++ b/perf_test/Benchmark_Utils.hpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ + +#ifndef KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP +#define KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP + +namespace KokkosKernelsBenchmark { + +class WrappedBool { + public: + WrappedBool(const bool &val) : val_(val) {} + + operator bool() const { return val_; } + + protected: + bool val_; +}; + +class DieOnError : public WrappedBool { + public: + DieOnError(const bool &val) : WrappedBool(val) {} +}; +class SkipOnError : public WrappedBool { + public: + SkipOnError(const bool &val) : WrappedBool(val) {} +}; + +} // namespace KokkosKernelsBenchmark + +#endif // KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP \ No newline at end of file diff --git a/perf_test/KokkosKernels_perf_test_instantiation.hpp b/perf_test/KokkosKernels_perf_test_instantiation.hpp index 9ed5ec23bc..6844922ddb 100644 --- a/perf_test/KokkosKernels_perf_test_instantiation.hpp +++ b/perf_test/KokkosKernels_perf_test_instantiation.hpp @@ -26,10 +26,20 @@ #error "The macro KOKKOSKERNELS_PERF_TEST_NAME was not defined" #endif +// All perf tests must implement print_options() +void print_options(); + int main_instantiation(int argc, char** argv) { perf_test::CommonInputParams params; perf_test::parse_common_options(argc, argv, params); + // If help is requested with "-h" or "--help", then just print the options + // and quit. 
+ if (params.print_help) { + print_options(); + return 0; + } + /* Assumption is that use_openmp/use_threads variables are */ /* provided as numbers of threads */ int num_threads = 1; diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index 0df96f4494..1303b2370e 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -33,20 +33,56 @@ struct CommonInputParams { int use_openmp = 0; int use_threads = 0; - int repeat = 0; + int repeat = 0; + bool print_help = false; }; std::string list_common_options() { std::ostringstream common_options; common_options - << "\t[Required] BACKEND:\n" - << "\t\t'--threads [numThreads]' |\n" - << "\t\t'--openmp [numThreads]' |\n" - << "\t\t'--cuda [deviceIndex]' |\n" - << "\t\t'--hip [deviceIndex]' |\n" - << "\t\t'--sycl [deviceIndex]'\n\n" - << "\tIf no parallel backend is requested, Serial will be used " - "(if enabled)\n\n"; + << "\t[Required] Backend: the available backends are:\n" +#ifdef KOKKOS_ENABLE_THREADS + << "\t\t'--threads [numThreads]'\n" +#endif +#ifdef KOKKOS_ENABLE_OPENMP + << "\t\t'--openmp [numThreads]'\n" +#endif +#ifdef KOKKOS_ENABLE_CUDA + << "\t\t'--cuda [deviceIndex]'\n" +#endif +#ifdef KOKKOS_ENABLE_HIP + << "\t\t'--hip [deviceIndex]'\n" +#endif +#ifdef KOKKOS_ENABLE_SYCL + << "\t\t'--sycl [deviceIndex]'\n" +#endif +#ifdef KOKKOS_ENABLE_SERIAL + << "\t\tIf no parallel backend is requested, Serial will be used.\n" +#endif + << "\n" + << "\t The following backends are not available because Kokkos was not " + "configured with them:\n" +#ifndef KOKKOS_ENABLE_THREADS + << "\t\t'--threads [numThreads]'\n" +#endif +#ifndef KOKKOS_ENABLE_OPENMP + << "\t\t'--openmp [numThreads]'\n" +#endif +#ifndef KOKKOS_ENABLE_CUDA + << "\t\t'--cuda [deviceIndex]'\n" +#endif +#ifndef KOKKOS_ENABLE_HIP + << "\t\t'--hip [deviceIndex]'\n" +#endif +#ifndef KOKKOS_ENABLE_SYCL + << "\t\t'--sycl [deviceIndex]'\n" +#endif 
+#ifndef KOKKOS_ENABLE_SERIAL + << "\t\tSerial is not enabled so a parallel backend must be selected.\n" +#endif + << "\n" + << "\t[Optional]:\n" + << "\t\t'-h', '--help': show available options\n\n"; return common_options.str(); } @@ -155,34 +191,42 @@ void parse_common_options(int& argc, char** argv, CommonInputParams& params) { // If e.g. params.use_cuda is 0, that means CUDA will not be used at all. // But if it's N, then it means run on CUDA device N-1. while (argIdx < argc) { - bool remove_flag = false; + // How many flags to delete from argc/argv + // 0: not a common option, so leave it + // 1: a bool parameter like '-h' + // 2: a parameter followed by a value, like "--cuda 0" + int remove_flags = 0; if (check_arg_int(argIdx, argc, argv, "--threads", params.use_threads)) { - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--openmp", params.use_openmp)) { - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--cuda", params.use_cuda)) { params.use_cuda++; - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--hip", params.use_hip)) { params.use_hip++; - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--sycl", params.use_sycl)) { params.use_sycl++; - remove_flag = true; + remove_flags = 2; } else if (check_arg_int(argIdx, argc, argv, "--repeat", params.repeat)) { - remove_flag = true; + remove_flags = 2; + } else if (check_arg_bool(argIdx, argc, argv, "-h", params.print_help) || + check_arg_bool(argIdx, argc, argv, "--help", + params.print_help)) { + remove_flags = 1; } - if (remove_flag) { - // Shift the remainder of the argv list by one. Note that argv has - // (argc + 1) arguments, the last one always being nullptr. 
The following - // loop moves the trailing nullptr element as well - for (int k = argIdx; k < argc - 1; ++k) { - argv[k] = argv[k + 2]; - argv[k + 1] = argv[k + 3]; + if (remove_flags) { + // Shift the remainder of the argv list left by the number of flags + // removed. Note that argv has (argc + 1) arguments, the last one always + // being nullptr. The following loop moves the trailing nullptr element + // as well + for (int k = argIdx + remove_flags; k <= argc; ++k) { + argv[k - remove_flags] = argv[k]; } - argc = argc - 2; + argc -= remove_flags; } else { ++argIdx; } diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp index 314439b6c0..f3eb0dd8ac 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -117,8 +117,7 @@ struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct FactorizeModeAndAlgo - : FactorizeModeAndAlgoDeviceImpl {}; +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; #endif template @@ -156,8 +155,7 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct SolveModeAndAlgo - : SolveModeAndAlgoDeviceImpl {}; +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif template diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp index 3f15ca0b2d..67a141578e 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -127,7 +127,7 @@ struct InverseDiagonalsModeAndAlgo #if defined(KOKKOS_ENABLE_HIP) template <> -struct InverseDiagonalsModeAndAlgo +struct InverseDiagonalsModeAndAlgo : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif @@ -166,8 
+166,7 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #if defined(KOKKOS_ENABLE_HIP) template <> -struct SolveModeAndAlgo - : SolveModeAndAlgoDeviceImpl {}; +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif int main(int argc, char *argv[]) { diff --git a/perf_test/batched/sparse/CMakeLists.txt b/perf_test/batched/sparse/CMakeLists.txt index 76a25d9938..b4f3c31f31 100644 --- a/perf_test/batched/sparse/CMakeLists.txt +++ b/perf_test/batched/sparse/CMakeLists.txt @@ -3,6 +3,15 @@ ADD_SUBDIRECTORY(cusolver) ADD_SUBDIRECTORY(GMRES) ADD_SUBDIRECTORY(SPMV) -FILE(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/scripts/binary_dir.txt -"${CMAKE_CURRENT_BINARY_DIR}" -) \ No newline at end of file +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_CG.sh.in + ${CMAKE_CURRENT_BINARY_DIR}/scripts/run_CG.sh +) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_GMRES.sh.in + ${CMAKE_CURRENT_BINARY_DIR}/scripts/run_GMRES.sh +) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_SPMV.sh.in + ${CMAKE_CURRENT_BINARY_DIR}/scripts/run_SPMV.sh +) diff --git a/perf_test/batched/sparse/scripts/run_CG.sh b/perf_test/batched/sparse/scripts/run_CG.sh deleted file mode 100755 index fc740b0a77..0000000000 --- a/perf_test/batched/sparse/scripts/run_CG.sh +++ /dev/null @@ -1,3 +0,0 @@ -exe_path=$(head -n 1 "binary_dir.txt") - -${exe_path}/CG/KokkosBatched_Test_CG -A ../data/A.mm -B ../data/B.mm -X ../output/X_CG -timers ../output/timers_CG -n1 10 -n2 100 -team_size -1 -implementation 0 -l -n_iterations 20 -tol 1e-8 -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_CG.sh.in b/perf_test/batched/sparse/scripts/run_CG.sh.in new file mode 100755 index 0000000000..d3b45fc5b6 --- /dev/null +++ b/perf_test/batched/sparse/scripts/run_CG.sh.in @@ -0,0 +1 @@ +@CMAKE_CURRENT_BINARY_DIR@/CG/KokkosBatched_Test_CG -A ../data/A.mm -B ../data/B.mm -X ../output/X_CG -timers ../output/timers_CG -n1 10 -n2 100 -team_size -1 
-implementation 0 -l -n_iterations 20 -tol 1e-8 -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_GMRES.sh b/perf_test/batched/sparse/scripts/run_GMRES.sh deleted file mode 100755 index e26ab2aa15..0000000000 --- a/perf_test/batched/sparse/scripts/run_GMRES.sh +++ /dev/null @@ -1,3 +0,0 @@ -exe_path=$(head -n 1 "binary_dir.txt") - -${exe_path}/GMRES/KokkosBatched_Test_GMRES -A ../data/A.mm -B ../data/B.mm -X ../output/X_GMRES -timers ../output/timers_GMRES -n1 10 -n2 100 -team_size -1 -implementation 0 -l -n_iterations 20 -tol 1e-8 -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_GMRES.sh.in b/perf_test/batched/sparse/scripts/run_GMRES.sh.in new file mode 100755 index 0000000000..b2e9e4174f --- /dev/null +++ b/perf_test/batched/sparse/scripts/run_GMRES.sh.in @@ -0,0 +1 @@ +@CMAKE_CURRENT_BINARY_DIR@/GMRES/KokkosBatched_Test_GMRES -A ../data/A.mm -B ../data/B.mm -X ../output/X_GMRES -timers ../output/timers_GMRES -n1 10 -n2 100 -team_size -1 -implementation 0 -l -n_iterations 20 -tol 1e-8 -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_SPMV.sh b/perf_test/batched/sparse/scripts/run_SPMV.sh deleted file mode 100755 index d4edd993aa..0000000000 --- a/perf_test/batched/sparse/scripts/run_SPMV.sh +++ /dev/null @@ -1,3 +0,0 @@ -exe_path=$(head -n 1 "binary_dir.txt") - -${exe_path}/SPMV/KokkosBatched_Test_SPMV -A ../data/A.mm -B ../data/B.mm -X ../output/X_SPMV -timers ../output/timers_SPMV -n1 10 -n2 100 -team_size -1 -implementation 3 -l -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/batched/sparse/scripts/run_SPMV.sh.in b/perf_test/batched/sparse/scripts/run_SPMV.sh.in new file mode 100755 index 0000000000..2c9fabe547 --- /dev/null +++ b/perf_test/batched/sparse/scripts/run_SPMV.sh.in @@ -0,0 +1 @@ +@CMAKE_CURRENT_BINARY_DIR@/SPMV/KokkosBatched_Test_SPMV -A ../data/A.mm -B 
../data/B.mm -X ../output/X_SPMV -timers ../output/timers_SPMV -n1 10 -n2 100 -team_size -1 -implementation 3 -l -vector_length 8 -N_team 8 \ No newline at end of file diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index 499a701c13..7bc25a5704 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -210,7 +210,7 @@ int main(int argc, char** argv) { } if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) - run(params.m, params.n, params.repeat); + run(params.m, params.n, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index 1e537ceadc..fca3030763 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -42,6 +42,8 @@ //@HEADER */ +#include + #include #include diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index 89680d20f9..54ae35ac7a 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -207,7 +207,7 @@ int main(int argc, char** argv) { if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) - run(params.m, params.repeat); + run(params.m, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index 14957994d1..c03cbb12ad 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -42,6 +42,8 @@ //@HEADER */ +#include + #include #include diff --git 
a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index 564db4af2e..5dfecd9015 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -211,11 +211,9 @@ int main(int argc, char** argv) { if (useHIP) { #if defined(KOKKOS_ENABLE_HIP) if (params.layoutLeft) - run(params.m, params.n, - params.repeat); + run(params.m, params.n, params.repeat); else - run(params.m, params.n, - params.repeat); + run(params.m, params.n, params.repeat); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 962328eb95..8f25026ba9 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -188,7 +188,7 @@ int main(int argc, char** argv) { if (params.use_hip) { #if defined(KOKKOS_ENABLE_HIP) - run(params); + run(params); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index 32d91e6b33..d617ffcdf3 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -180,7 +180,7 @@ int main(int argc, char** argv) { if (params.use_hip) { #if defined(KOKKOS_ENABLE_HIP) - run(params); + run(params); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; diff --git a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp index cbadcef0b1..de2db8dbb0 100644 --- a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp @@ 
-21,7 +21,7 @@ #include -#include +#include #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" @@ -185,7 +185,7 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { for (int i = 0; i < options.start.a.k; ++i) { auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosLapack::trtri(&trtri_args.uplo, &trtri_args.diag, A); } // Fence after each batch operation Kokkos::fence(); @@ -196,7 +196,7 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { for (int i = 0; i < options.start.a.k; ++i) { auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosLapack::trtri(&trtri_args.uplo, &trtri_args.diag, A); } // Fence after each batch operation Kokkos::fence(); @@ -300,7 +300,7 @@ struct parallel_blas_trtri { void operator()(const int& i) const { auto svA = Kokkos::subview(trtri_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); + KokkosLapack::trtri(&trtri_args_.uplo, &trtri_args_.diag, svA); } }; #endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP && diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index 57f241d7b1..134611739a 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -632,8 +632,8 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { KokkosKernels::Experiment::run_multi_mem_experiment< - size_type, idx, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); + size_type, idx, Kokkos::HIP, Kokkos::HIPSpace, Kokkos::HIPSpace>( + params); } #endif diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index f05040c083..e4331dd542 100644 --- 
a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -708,8 +708,7 @@ int main(int argc, char* argv[]) { if (params.use_hip) { if (!use_multi_mem) { KokkosKernels::Experiment::experiment_driver< - kk_size_type, kk_lno_t, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace>(params); + kk_size_type, kk_lno_t, Kokkos::HIP, Kokkos::HIPSpace>(params); } } #endif diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index a97cbb4d81..8f7d6a1983 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -316,7 +316,7 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { - run_mis2(params); + run_mis2(params); run = true; } #endif diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp index e9dc3f2f8e..d45eec48c4 100644 --- a/perf_test/ode/KokkosODE_RK.cpp +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -132,8 +132,8 @@ struct RKSolve_wrapper { auto local_y_new = Kokkos::subview(y_new, Kokkos::pair(2 * idx, 2 * idx + 1)); auto local_tmp = Kokkos::subview(tmp, Kokkos::pair(2 * idx, 2 * idx + 1)); - auto local_kstack = Kokkos::subview( - kstack, Kokkos::pair(2 * idx, 2 * idx + 1), Kokkos::ALL()); + auto local_kstack = Kokkos::subview(kstack, Kokkos::ALL(), + Kokkos::pair(2 * idx, 2 * idx + 1)); // Run Runge-Kutta time integrator KokkosODE::Impl::RKSolve( @@ -178,7 +178,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { table_type table; ode_params params(num_steps); vec_type tmp("tmp vector", neqs * num_odes); - mv_type kstack("k stack", neqs * num_odes, table.nstages); + mv_type kstack("k stack", table.nstages, neqs * num_odes); // Set initial conditions vec_type y_new("solution", neqs * num_odes); @@ -230,7 +230,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { table_type table; ode_params params(num_steps); vec_type tmp("tmp 
vector", neqs * num_odes); - mv_type kstack("k stack", neqs * num_odes, table.nstages); + mv_type kstack("k stack", table.nstages, neqs * num_odes); // Set initial conditions vec_type y_new("solution", neqs * num_odes); diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 263f59671a..8a994b4122 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -135,4 +135,14 @@ if (KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( sparse_spmv_benchmark SOURCES KokkosSparse_spmv_benchmark.cpp ) + + KOKKOSKERNELS_ADD_BENCHMARK( + sparse_spmv_bsr_benchmark SOURCES KokkosSparse_spmv_bsr_benchmark.cpp + ) + + # hipcc 5.2 has an underlying clang that has the std::filesystem + # in an experimental namespace and a different library + if (Kokkos_CXX_COMPILER_ID STREQUAL HIPCC AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS 5.3) + target_link_libraries(KokkosKernels_sparse_spmv_bsr_benchmark PRIVATE -lstdc++fs) + endif() endif() diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index c11c6bdc02..163fdb2dd1 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -53,6 +53,7 @@ struct GS_Parameters { int maxNnzPerLongRow = 2000; bool graph_symmetric = false; int sweeps = 1; + int nstreams = 1; GSAlgorithm algo = GS_DEFAULT; GSDirection direction = GS_FORWARD; // Point: @@ -187,78 +188,174 @@ void runGS(const GS_Parameters& params) { Kokkos::finalize(); exit(1); } + std::vector instances; // size_type nnz = A.nnz(); - KernelHandle kh; + std::vector kh(params.nstreams); // use a random RHS - uniformly distributed over (-5, 5) - scalar_view_t b("b", nrows); - { - srand(54321); - auto bhost = Kokkos::create_mirror_view(b); - for (lno_t i = 0; i < nrows; i++) { - bhost(i) = 10.0 * rand() / RAND_MAX - 5.0; - } - Kokkos::deep_copy(b, bhost); - } - double bnorm = KokkosBlas::nrm2(b); + std::vector b(params.nstreams); // initial LHS is 0 - 
scalar_view_t x("x", nrows); + std::vector x(params.nstreams); + // Extract diagonal blocks of CRS matrix + std::vector DiagBlks(params.nstreams); // how long symbolic/numeric phases take (the graph reuse case isn't that // interesting since numeric doesn't do much) Kokkos::Timer timer; - // cluster size of 1 is standard multicolor GS - if (params.algo == GS_DEFAULT) { - kh.create_gs_handle(); - kh.get_point_gs_handle()->set_long_row_threshold(params.longRowThreshold); - } else if (params.algo == GS_CLUSTER) { - kh.create_gs_handle(params.coarse_algo, params.cluster_size); - } else { - kh.create_gs_handle(params.algo); - if (params.algo == GS_TWOSTAGE) kh.set_gs_twostage(!params.classic, nrows); + + { + namespace KE = Kokkos::Experimental; + auto ns = params.nstreams; + auto es = exec_space(); + std::vector weights(ns); + std::fill(weights.begin(), weights.end(), 1); + instances = KE::partition_space(es, weights); + } + + double blockExtractionTime = 0, symbolicLaunchTimeTotal = 0, + symbolicComputeTimeTotal = 0, numericLaunchTimeTotal = 0, + numericComputeTimeTotal = 0, applyLaunchTimeTotal = 0, + applyComputeTimeTotal = 0; + + timer.reset(); + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, + DiagBlks); + Kokkos::fence(); + blockExtractionTime = timer.seconds(); + + /////////////////// Handle creation /////////////////// + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); + auto blk_ncols = blk_A.numCols(); + if (blk_nrows != blk_ncols) { + cout << "ERROR: Gauss-Seidel only works for square matrices\n"; + Kokkos::finalize(); + exit(1); + } + b[i] = scalar_view_t("b[" + std::to_string(i) + "]", blk_nrows); + x[i] = scalar_view_t("x[" + std::to_string(i) + "]", blk_nrows); + { + srand(54321 + i); + auto bhost = Kokkos::create_mirror_view(b[i]); + for (lno_t row_id = 0; row_id < blk_nrows; row_id++) { + bhost(row_id) = 10.0 * rand() / RAND_MAX - 5.0; + } + 
Kokkos::deep_copy(instances[i], b[i], bhost); + } + // cluster size of 1 is standard multicolor GS + if (params.algo == GS_DEFAULT) { + kh[i].create_gs_handle(instances[i], params.nstreams); + kh[i].get_point_gs_handle()->set_long_row_threshold( + params.longRowThreshold); + } else if (params.algo == GS_CLUSTER) { + kh[i].create_gs_handle(params.coarse_algo, params.cluster_size); + } else { + kh[i].create_gs_handle(params.algo); + if (params.algo == GS_TWOSTAGE) + kh[i].set_gs_twostage(!params.classic, blk_nrows); + } + } + + /////////////////// Symbolic ///////////////// + timer.reset(); + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); + KokkosSparse::Experimental::gauss_seidel_symbolic( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, params.graph_symmetric); + } + symbolicLaunchTimeTotal = timer.seconds(); + timer.reset(); + Kokkos::fence(); + symbolicComputeTimeTotal = timer.seconds(); + + /////////////////// Numeric ///////////////// + timer.reset(); + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); + KokkosSparse::Experimental::gauss_seidel_numeric( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, params.graph_symmetric); } + numericLaunchTimeTotal = timer.seconds(); timer.reset(); - KokkosSparse::Experimental::gauss_seidel_symbolic( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, - params.graph_symmetric); - double symbolicTime = timer.seconds(); - std::cout << "\n*** Symbolic time: " << symbolicTime << '\n'; + Kokkos::fence(); + numericComputeTimeTotal = timer.seconds(); + + /////////////////// Apply ///////////////// timer.reset(); - KokkosSparse::Experimental::gauss_seidel_numeric( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, - params.graph_symmetric); - double numericTime = timer.seconds(); - std::cout << "\n*** 
Numeric time: " << numericTime << '\n'; + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); + // Last two parameters are damping factor (should be 1) and sweeps + switch (params.direction) { + case GS_SYMMETRIC: + KokkosSparse::Experimental::symmetric_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); + break; + case GS_FORWARD: + KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); + break; + case GS_BACKWARD: + KokkosSparse::Experimental::backward_sweep_gauss_seidel_apply( + instances[i], &kh[i], blk_nrows, blk_nrows, blk_A.graph.row_map, + blk_A.graph.entries, blk_A.values, x[i], b[i], true, true, 1.0, + params.sweeps); + break; + } + } + applyLaunchTimeTotal = timer.seconds(); + timer.reset(); + Kokkos::fence(); + applyComputeTimeTotal = timer.seconds(); timer.reset(); - // Last two parameters are damping factor (should be 1) and sweeps - switch (params.direction) { - case GS_SYMMETRIC: - KokkosSparse::Experimental::symmetric_gauss_seidel_apply( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, x, b, - true, true, 1.0, params.sweeps); - break; - case GS_FORWARD: - KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, x, b, - true, true, 1.0, params.sweeps); - break; - case GS_BACKWARD: - KokkosSparse::Experimental::backward_sweep_gauss_seidel_apply( - &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, x, b, - true, true, 1.0, params.sweeps); - break; + + for (int i = 0; i < params.nstreams; i++) { + auto blk_A = DiagBlks[i]; + auto blk_nrows = blk_A.numRows(); + kh[i].destroy_gs_handle(); + // Now, compute the 2-norm of residual + 
scalar_view_t res("Ax-b", blk_nrows); + Kokkos::deep_copy(instances[i], res, b[i]); + double bnorm = KokkosBlas::nrm2(instances[i], b[i]); + scalar_t alpha = Kokkos::reduction_identity::prod(); + scalar_t beta = -alpha; + KokkosSparse::spmv(instances[i], "N", alpha, blk_A, x[i], + beta, res); + double resnorm = KokkosBlas::nrm2(instances[i], res); + // note: this still works if the solution diverges + std::cout << "StreamID(" << i << "): Relative res norm: " << resnorm / bnorm + << '\n'; } - double applyTime = timer.seconds(); - std::cout << "\n*** Apply time: " << applyTime << '\n'; - kh.destroy_gs_handle(); - // Now, compute the 2-norm of residual - scalar_view_t res("Ax-b", nrows); - Kokkos::deep_copy(res, b); - scalar_t alpha = Kokkos::reduction_identity::prod(); - scalar_t beta = -alpha; - KokkosSparse::spmv("N", alpha, A, x, beta, res); - double resnorm = KokkosBlas::nrm2(res); - // note: this still works if the solution diverges - std::cout << "Relative res norm: " << resnorm / bnorm << '\n'; + std::cout << "\n*** Total block extraction time: " << blockExtractionTime + << '\n'; + std::cout << "\n*** Total Symbolic launch time: " << symbolicLaunchTimeTotal + << '\n'; + std::cout << "*** Total Symbolic compute time: " << symbolicComputeTimeTotal + << '\n'; + std::cout << "\n*** Total Numeric launch time: " << numericLaunchTimeTotal + << '\n'; + std::cout << "*** Total Numeric compute time: " << numericComputeTimeTotal + << '\n'; + std::cout << "\n*** Total Apply launch time: " << applyLaunchTimeTotal + << '\n'; + std::cout << "*** Total Apply compute time: " << applyComputeTimeTotal + << '\n'; + double launchTimeTotal = + symbolicLaunchTimeTotal + numericLaunchTimeTotal + applyLaunchTimeTotal; + std::cout << "\n*** Total launch time: " << launchTimeTotal << '\n'; + double computeTimeTotal = symbolicComputeTimeTotal + numericComputeTimeTotal + + applyComputeTimeTotal; + std::cout << "*** Total compute time: " << computeTimeTotal << '\n'; + std::cout << "\n*** 
Total compute and launch time: " + << launchTimeTotal + computeTimeTotal << '\n'; } int main(int argc, char** argv) { @@ -274,6 +371,8 @@ int main(int argc, char** argv) { "symmetric.\n"; cout << " : if generating matrix randomly, it is symmetrized\n"; cout << "--sweeps S: run S times (default 1)\n"; + cout << "--streams N: partition matrix and run across N streams (default " + "1)\n"; cout << "Randomized matrix settings, if not reading from file:\n"; cout << " --n : number of rows/columns\n"; cout << " --nnz : number of nonzeros in each regular row\n"; @@ -340,6 +439,8 @@ int main(int argc, char** argv) { params.direction = GS_BACKWARD; else if (!strcmp(argv[i], "--sweeps")) params.sweeps = atoi(getNextArg(i, argc, argv)); + else if (!strcmp(argv[i], "--streams")) + params.nstreams = atoi(getNextArg(i, argc, argv)); else if (!strcmp(argv[i], "--point")) params.algo = GS_DEFAULT; else if (!strcmp(argv[i], "--cluster")) diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 285cf026b4..3f4893363a 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -56,6 +56,15 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, } numRows = A.numRows(); numCols = A.numCols(); + + std::cout << "A is " << numRows << "x" << numCols << ", with " << A.nnz() + << " nonzeros\n"; + std::cout << "SpMV mode " << mode << ", " << num_vecs + << " vectors, beta = " << beta << ", multivectors are "; + std::cout << (std::is_same_v ? 
"LayoutLeft" + : "LayoutRight"); + std::cout << '\n'; + mv_type x("X", numCols, num_vecs); mv_type y("Y", numRows, num_vecs); h_mv_type h_x = Kokkos::create_mirror_view(x); @@ -73,6 +82,17 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, // Benchmark auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0); auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0); + // Do 5 warm up calls (not timed) + for (int i = 0; i < 5; i++) { + if (num_vecs == 1) { + // run the rank-1 version + KokkosSparse::spmv(&mode, 1.0, A, x0, beta, y0); + } else { + // rank-2 + KokkosSparse::spmv(&mode, 1.0, A, x, beta, y); + } + Kokkos::DefaultExecutionSpace().fence(); + } Kokkos::Timer timer; for (int i = 0; i < loop; i++) { if (num_vecs == 1) { @@ -169,9 +189,6 @@ int main(int argc, char** argv) { Kokkos::initialize(argc, argv); - std::cout << size << " rows/cols, mode " << mode << ", " << num_vecs - << " vectors, beta = " << beta << ", layout " << layout << ": "; - if (layout == 'L') run_spmv(size, size, filename, loop, num_vecs, mode, beta); diff --git a/perf_test/sparse/KokkosSparse_par_ilut.cpp b/perf_test/sparse/KokkosSparse_par_ilut.cpp index ef144f2817..52191a47f9 100644 --- a/perf_test/sparse/KokkosSparse_par_ilut.cpp +++ b/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -71,9 +71,6 @@ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; using float_t = typename Kokkos::ArithTraits::mag_type; -static constexpr bool IS_GPU = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - /////////////////////////////////////////////////////////////////////////////// void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, const sp_matrix_type& A, int& num_iters) @@ -132,6 +129,9 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, #ifdef USE_GINKGO /////////////////////////////////////////////////////////////////////////////// +static constexpr bool IS_GPU = + 
KokkosKernels::Impl::kk_is_gpu_exec_space(); + using ginkgo_exec = std::conditional_t; diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 475bfe5f85..9825f7c90d 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -366,8 +366,7 @@ int main(int argc, char **argv) { if (cmdline[CMD_USE_CUDA]) run_pcg(cmdline, mtx_file); #endif #if defined(KOKKOS_ENABLE_HIP) - if (cmdline[CMD_USE_HIP]) - run_pcg(cmdline, mtx_file); + if (cmdline[CMD_USE_HIP]) run_pcg(cmdline, mtx_file); #endif } Kokkos::finalize(); diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index e8a0b19419..3b347eb903 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -90,7 +90,6 @@ void print_options() { } int parse_inputs(LocalParams& params, int argc, char** argv) { - bool printHelp = false; bool discard; for (int i = 1; i < argc; ++i) { // if (perf_test::check_arg_str(i, argc, argv, "--amtx", params.amtx)) { @@ -131,8 +130,6 @@ int parse_inputs(LocalParams& params, int argc, char** argv) { ++i; } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", params.verbose)) { - } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { - } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { } else { std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; @@ -140,10 +137,6 @@ int parse_inputs(LocalParams& params, int argc, char** argv) { return 1; } } - if (printHelp) { - print_options(); - return 1; - } return 0; } @@ -192,6 +185,31 @@ void run_experiment(int argc, char** argv, CommonInputParams) { "If running MKL, can't output the result to file"); } + // Check that offset/ordinal types are compatible with any requested TPLs +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (params.use_mkl) { + if constexpr (!std::is_same_v) { + throw std::runtime_error( + "MKL 
configured with long long int not supported in Kokkos Kernels"); + } + if constexpr (!std::is_same_v || + !std::is_same_v) { + throw std::runtime_error( + "Must enable int as both ordinal and offset type in KokkosKernels to " + "call MKL SpAdd"); + } + } +#endif + + if (params.use_cusparse) { + if constexpr (!std::is_same_v || + !std::is_same_v) { + throw std::runtime_error( + "Must enable int as both ordinal and offset type in KokkosKernels to " + "call cuSPARSE SpAdd"); + } + } + std::cout << "************************************* \n"; crsMat_t A; crsMat_t B; @@ -326,9 +344,11 @@ void run_experiment(int argc, char** argv, CommonInputParams) { } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - sparse_matrix_t Amkl, Bmkl, Cmkl; + sparse_matrix_t Amkl = sparse_matrix_t(), Bmkl = sparse_matrix_t(), + Cmkl = sparse_matrix_t(); if (params.use_mkl) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v && + std::is_same_v) { KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), @@ -337,9 +357,6 @@ void run_experiment(int argc, char** argv, CommonInputParams) { &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())); - } else { - throw std::runtime_error( - "MKL configured with long long int not supported in Kokkos Kernels"); } } #endif @@ -354,22 +371,30 @@ void run_experiment(int argc, char** argv, CommonInputParams) { c_nnz = addHandle->get_c_nnz(); } else if (params.use_cusparse) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // Symbolic phase: compute buffer size, then compute nnz - size_t bufferSize; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2_bufferSizeExt( - cusparseHandle, A.numRows(), A.numCols(), &alphabeta, A_cusparse, - A.nnz(), A.values.data(), A.graph.row_map.data(), - A.graph.entries.data(), &alphabeta, B_cusparse, B.nnz(), - 
B.values.data(), B.graph.row_map.data(), B.graph.entries.data(), - C_cusparse, NULL, row_mapC.data(), NULL, &bufferSize)); - // Allocate work buffer - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaMalloc((void**)&cusparseBuffer, bufferSize)); - KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgeam2Nnz( - cusparseHandle, m, n, A_cusparse, A.nnz(), A.graph.row_map.data(), - A.graph.entries.data(), B_cusparse, B.nnz(), B.graph.row_map.data(), - B.graph.entries.data(), C_cusparse, row_mapC.data(), &c_nnz, - cusparseBuffer)); + if constexpr (std::is_same_v && + std::is_same_v) { + // Symbolic phase: compute buffer size, then compute nnz + size_t bufferSize; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2_bufferSizeExt( + cusparseHandle, A.numRows(), A.numCols(), &alphabeta, A_cusparse, + A.nnz(), A.values.data(), A.graph.row_map.data(), + A.graph.entries.data(), &alphabeta, B_cusparse, B.nnz(), + B.values.data(), B.graph.row_map.data(), B.graph.entries.data(), + C_cusparse, NULL, row_mapC.data(), NULL, &bufferSize)); + // Allocate work buffer + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc((void**)&cusparseBuffer, bufferSize)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgeam2Nnz( + cusparseHandle, m, n, A_cusparse, A.nnz(), A.graph.row_map.data(), + A.graph.entries.data(), B_cusparse, B.nnz(), B.graph.row_map.data(), + B.graph.entries.data(), C_cusparse, row_mapC.data(), &c_nnz, + cusparseBuffer)); + } else { + throw std::runtime_error( + "Must enable int as both ordinal and offset type in KokkosKernels " + "to " + "call cuSPARSE"); + } #endif } if (!params.use_mkl) { @@ -388,24 +413,32 @@ void run_experiment(int argc, char** argv, CommonInputParams) { for (int numericRep = 0; numericRep < params.numericRepeat; numericRep++) { if (params.use_cusparse) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2( - cusparseHandle, m, n, &alphabeta, A_cusparse, A.nnz(), - A.values.data(), A.graph.row_map.data(), A.graph.entries.data(), - &alphabeta, B_cusparse, B.nnz(), 
B.values.data(), - B.graph.row_map.data(), B.graph.entries.data(), C_cusparse, - valuesC.data(), row_mapC.data(), entriesC.data(), cusparseBuffer)); + if constexpr (std::is_same_v && + std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2( + cusparseHandle, m, n, &alphabeta, A_cusparse, A.nnz(), + A.values.data(), A.graph.row_map.data(), A.graph.entries.data(), + &alphabeta, B_cusparse, B.nnz(), B.values.data(), + B.graph.row_map.data(), B.graph.entries.data(), C_cusparse, + valuesC.data(), row_mapC.data(), entriesC.data(), + cusparseBuffer)); + } #endif } else if (params.use_mkl) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_add( - SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); + if constexpr (std::is_same_v && + std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_add( + SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); + } #endif } else { - spadd_numeric( - &kh, A.graph.row_map, A.graph.entries, A.values, 1.0, // A, alpha - B.graph.row_map, B.graph.entries, B.values, 1.0, // B, beta - row_mapC, entriesC, valuesC); // C + spadd_numeric(&kh, A.graph.row_map, A.graph.entries, A.values, + 1.0, // A, alpha + B.graph.row_map, B.graph.entries, B.values, + 1.0, // B, beta + row_mapC, entriesC, valuesC); // C } } numericTime += timer.seconds(); diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index cee68ef11a..2d03be80ac 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -158,7 +158,6 @@ void print_options() { int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, char** argv) { std::string algoStr; - bool printHelp; for (int i = 1; i < argc; ++i) { if (perf_test::check_arg_int(i, argc, argv, "--repeat", params.repeat)) { ++i; @@ -276,8 +275,6 @@ int 
parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, return 1; } ++i; - } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { - } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { } else { std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; @@ -285,10 +282,6 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, return 1; } } - if (printHelp) { - print_options(); - return 1; - } return 0; } diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp new file mode 100644 index 0000000000..770b09cfb1 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp @@ -0,0 +1,476 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! 
\file KokkosSparse_spmv_bsr_benchmark.cpp + + Read a matrix market file, choose a block size and a number of multivectors, + and compare Bsr SpMV implementations +*/ + +#include +#include + +#include + +/* Some versions of clang that hipcc is based off of haven't stabilized + * std::filesystem yet */ +#if defined(KOKKOS_ENABLE_HIP) && __HIPCC__ +#include +namespace fs = std::experimental::filesystem; +#else +#include +namespace fs = std::filesystem; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +#include +#endif + +#include + +#include + +#include + +#include "Benchmark_Utils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_crs_to_bsr_impl.hpp" +#include "KokkosSparse_crs_detect_block_size.hpp" + +using namespace KokkosKernelsBenchmark; + +/* Since benchmarks have to be defined before they are executed, the file IO + for each benchmark needs to be in the execution itself, otherwise every + matrix would have to be resident in memory before any benchmark can run. + + If multiple benchmarks need the same file, it would be read over and over + again. This is especially painful on network file systems, so this executable + has a global cache to store the most recently-read matrix. + + Despite that the matrix is always read with the same precision, we don't + know the Device at this time, so we can't define the value type of the cache + yet. Instead, we'll erase the type, and use a pointer to void. 
The cache will + be keyed on a combination of the path and the requested type, so we know if + the actual CrsMatrix behind the void pointer matches the requested type or + not +*/ +using Key = std::tuple; +using Val = std::shared_ptr; // type-erased Crs matrix (since we don't + // know the template params) +static Key CACHE_KEY = {"", std::type_index(typeid(void))}; +static Val CACHE_VAL = nullptr; + +// This can be called before Kokkos::finalize to kill the matrix that is living +// in the cache +void drop_cache() { + CACHE_KEY = {"", std::type_index(typeid(void))}; + CACHE_VAL = nullptr; +} + +/// cache repeated reads to \c path +template +Crs cached_read(const fs::path &path) { + // check if the cached matrix is a Crs from path + const Key key(path, std::type_index(typeid(Crs))); + + // if this is not the cached matrix, overwrite the cache + if (CACHE_KEY != key) { + CACHE_KEY = key; + CACHE_VAL = std::make_shared( + KokkosSparse::Impl::read_kokkos_crst_matrix(path.c_str())); + } + + // the Crs type is part of the key, so we know this cast is safe + return *std::static_pointer_cast(CACHE_VAL); +} + +/* Cache a map of path -> matrix block size so that scanning the matrix to + * register the benchmark and then actually running the benchmark don't both + * need to run the matrix */ +template +size_t detect_block_size(const fs::path &path) { + using ReadScalar = double; + using ReadOrdinal = int64_t; + using ReadOffset = uint64_t; + using Crs = KokkosSparse::CrsMatrix; + + static std::map cache; + + if (0 == cache.count(path)) { + std::cerr << "read " << path << "...\n"; + const Crs crs = cached_read(path); + size_t detectedSize = KokkosSparse::Impl::detect_block_size(crs); + std::cerr << "detected block size = " << detectedSize << "\n"; + cache[path] = detectedSize; + } + return cache.at(path); +} + +// Test that y_act is close to y_exp. 
+// This needs the matrix, alpha, and beta to compute the error tolerance +// properly +template +void check_correctness(benchmark::State &state, const View &y_exp, + const View &y_act, const Matrix &crs, const Alpha &alpha, + const Beta &beta, const DieOnError &die, + const SkipOnError &skip) { + using execution_space = typename View::execution_space; + using scalar_type = typename View::non_const_value_type; + using AT = Kokkos::ArithTraits; + using mag_type = typename AT::mag_type; + using ATM = Kokkos::ArithTraits; + + // max value in A + mag_type maxA = 0; + Kokkos::parallel_reduce( + "maxA", Kokkos::RangePolicy(0, crs.nnz()), + KOKKOS_LAMBDA(const int &i, mag_type &lmax) { + mag_type v = AT::abs(crs.values(i)); + lmax = lmax > v ? lmax : v; + }, + maxA); + + double eps = AT::epsilon(); + const double max_val = + AT::abs(beta * 1.0 + crs.numCols() * alpha * maxA * 1.0); + + auto h_exp = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y_exp); + auto h_act = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y_act); + + size_t err = 0; + std::vector> errIdx; + for (size_t i = 0; i < h_exp.extent(0); ++i) { + for (size_t k = 0; k < h_exp.extent(1); ++k) { + const mag_type error = ATM::abs(h_exp(i, k) - h_act(i, k)); + if (error > eps * max_val) { + ++err; + errIdx.push_back({i, k}); + } + } + } + if (err > 0) { + size_t errLimit = 100; // how many errors to print + std::cerr << "first " << errLimit << " errors...\n"; + std::cerr << "i\tk\texp\tact" << std::endl; + std::cerr << "-\t-\t---\t---" << std::endl; + for (auto [i, k] : errIdx) { + std::cerr << i << "\t" << k << "\t" << h_exp(i, k) << "\t" << h_act(i, k) + << std::endl; + if (0 == --errLimit) { + break; + } + } + std::cerr << __FILE__ << ":" << __LINE__ << ": ERROR: correctness failed " + << std::endl; + std::cerr << __FILE__ << ":" << __LINE__ << ": threshold was " + << eps * max_val << std::endl; + + if (die) { + exit(EXIT_FAILURE); + } else if (skip) { + state.SkipWithError("correctness 
check failed"); + } + } +} + +// Wrapper to create a common interface for all SpMVs to benchmark +struct SpmvDefault { + template + static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, + const XView &x, const Beta &beta, const YView &y) { + return KokkosSparse::spmv(mode, alpha, crs, x, beta, y); + } + + static std::string name() { return "default"; } +}; + +// Wrapper to create a common interface for all SpMVs to benchmark +struct SpmvNative { + template + static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, + const XView &x, const Beta &beta, const YView &y) { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "native"); + return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y); + } + + static std::string name() { return "native"; } +}; + +// Wrapper to create a common interface for all SpMVs to benchmark +struct SpmvV41 { + template + static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, + const XView &x, const Beta &beta, const YView &y) { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "v4.1"); + return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y); + } + + static std::string name() { return "v4.1"; } +}; + +template +void run(benchmark::State &state, const Bsr &bsr, const size_t k) { + using execution_space = typename Bsr::execution_space; + using memory_space = typename Bsr::memory_space; + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using size_type = typename Bsr::non_const_size_type; + + // multivector should be layoutleft for CPU, makes + // slices of a single vector contiguous + using view_t = Kokkos::View; + + state.counters["nnz"] = bsr.nnz() * bsr.blockDim() * bsr.blockDim(); + state.counters["num_rows"] = bsr.numRows() * bsr.blockDim(); + state.counters["block_size"] = bsr.blockDim(); + state.counters["num_vecs"] = k; + 
+ view_t y_init("y_init", bsr.numRows() * bsr.blockDim(), k); + view_t y_exp("ye", bsr.numRows() * bsr.blockDim(), k); + view_t y_act("ya", bsr.numRows() * bsr.blockDim(), k); + view_t x("x", bsr.numCols() * bsr.blockDim(), k); + + Kokkos::Random_XorShift64_Pool random_pool(12345); + fill_random(y_init, random_pool, 0.0, 1.0); + fill_random(x, random_pool, 0.0, 1.0); + scalar_type alpha{1.17}; + scalar_type beta{-0.3}; + + Kokkos::deep_copy(y_act, y_init); + Kokkos::deep_copy(y_exp, y_init); + + const char *mode = KokkosSparse::NoTranspose; + + // test the SpMV against whatever the default is + Spmv::spmv(mode, alpha, bsr, x, beta, y_act); + Kokkos::fence(); + KokkosSparse::spmv(mode, alpha, bsr, x, beta, y_exp); + Kokkos::fence(); + + check_correctness(state, y_exp, y_act, bsr, alpha, beta, DieOnError(false), + SkipOnError(true)); + + Kokkos::fence(); + for (auto _ : state) { + Spmv::spmv(mode, alpha, bsr, x, beta, y_exp); + Kokkos::fence(); + } + + const size_t bytesPerSpmv = + bsr.nnz() * bsr.blockDim() * bsr.blockDim() * + sizeof(scalar_type) // A values + + bsr.nnz() * sizeof(ordinal_type) // A col indices + + (bsr.numRows() + 1) * sizeof(size_type) // A row-map + + 2 * bsr.numRows() * bsr.blockDim() * k * + sizeof(scalar_type) // load / store y + + bsr.numCols() * bsr.blockDim() * k * sizeof(scalar_type) // load x + ; + + state.SetBytesProcessed(bytesPerSpmv * state.iterations()); +} + +template +void read_expand_run(benchmark::State &state, const fs::path &path, + const size_t blockSize, const size_t k) { + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + + // read Crs into host memory + using Crs = + KokkosSparse::CrsMatrix; + + const Crs crs = cached_read(path); + Bsr bsr; + try { + bsr = KokkosSparse::Impl::expand_crs_to_bsr(crs, blockSize); + } catch (std::exception &e) { + state.SkipWithError(e.what()); + return; + } + + run(state, bsr, k); +} + +template +void 
read_convert_run(benchmark::State &state, const fs::path &path, + const size_t blockSize, const size_t k) { + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + + using Crs = + KokkosSparse::CrsMatrix; + + const Crs crs = cached_read(path); + Bsr bsr; + try { + bsr = KokkosSparse::Impl::blocked_crs_to_bsr(crs, blockSize); + } catch (std::exception &e) { + state.SkipWithError(e.what()); + return; + } + + run(state, bsr, k); +} + +template +void register_expand_type(const fs::path &path) { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + std::vector ks = {1, 3}; + for (size_t bs : {4, 7, 10, 16}) { // block sizes + for (size_t k : ks) { // multivector sizes + std::string name = + std::string("MatrixMarketExpanded") + "/" + std::string(path.stem()) + + "/" + Kokkos::ArithTraits::name() + "/" + + Kokkos::ArithTraits::name() + "/" + + Kokkos::ArithTraits::name() + "/" + std::to_string(bs) + "/" + + std::to_string(k) + "/" + Spmv::name() + "/" + Device::name(); + benchmark::RegisterBenchmark(name.c_str(), read_expand_run, + path, bs, k) + ->UseRealTime(); + } + } +} + +template +void register_convert_type(const fs::path &path, size_t bs) { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + std::vector ks = {1, 3}; + + for (size_t k : ks) { // multivector sizes + std::string name = + std::string("MatrixMarketConvert") + "/" + std::string(path.stem()) + + "/" + Kokkos::ArithTraits::name() + "/" + + Kokkos::ArithTraits::name() + "/" + + Kokkos::ArithTraits::name() + "/" + std::to_string(bs) + "/" + + std::to_string(k) + "/" + Spmv::name() + "/" + Device::name(); + benchmark::RegisterBenchmark(name.c_str(), read_convert_run, + path, bs, k) + ->UseRealTime(); + } +} + +template +void register_converts(const fs::path &path, const size_t bs) { + std::cerr << "benchmarks will use detected blocksize\n"; + // clang-format off + register_convert_type(path, bs); + register_convert_type(path, bs); + 
register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + // clang-format on +} + +template +void register_expands(const fs::path &path) { + std::cerr << "benchmarks will expand each non-zero into a larger block\n"; + // clang-format off + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + // clang-format on +} + +template +void register_path(const fs::path &path) { + size_t detectedSize; + try { + detectedSize = detect_block_size(path); + } catch (const std::exception &e) { + std::cerr << "ERROR while reading: " << e.what() << "\n" + << "skipping!\n"; + return; + } + + /* If a block size can be detected, just use that block size without + expanding the matrix. 
+ Otherwise, expand the matrix to some arbitrary block sizes to test BSR + */ + if (detectedSize != 1) { + register_converts(path, detectedSize); + } else { + register_expands(path); + } +} + +int main(int argc, char **argv) { + Kokkos::initialize(argc, argv); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMicrosecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + for (int i = 1; i < argc; ++i) { +#if defined(KOKKOS_ENABLE_CUDA) + register_path(argv[i]); +#endif +#if defined(KOKKOS_ENABLE_HIP) + register_path(argv[i]); +#endif +#if defined(KOKKOS_ENABLE_SERIAL) + register_path(argv[i]); +#endif + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + drop_cache(); + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/perf_test/sparse/KokkosSparse_spmv_merge.cpp b/perf_test/sparse/KokkosSparse_spmv_merge.cpp index 3110223e3c..6ad772116e 100644 --- a/perf_test/sparse/KokkosSparse_spmv_merge.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_merge.cpp @@ -303,8 +303,10 @@ int main(int argc, char** argv) { for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { Kokkos::Timer timer; // KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); - KokkosSparse::Impl::spmv_beta(controls, "N", alpha, test_matrix, x, + KokkosSparse::Impl::spmv_beta(Kokkos::DefaultExecutionSpace{}, + controls, "N", alpha, test_matrix, x, beta, y); Kokkos::fence(); double time = timer.seconds(); diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 1290b5814b..02fcd1640a 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -179,7 +179,8 @@ void struct_matvec(const int stencil_type, int64_t worksets_ext = (numInteriorPts + rows_per_team_ext - 1) / rows_per_team_ext; - KokkosSparse::Impl::SPMV_Struct_Functor + KokkosSparse::Impl::SPMV_Struct_Functor 
spmv_struct(structure, stencil_type, alpha, A, x, beta, y, rows_per_team_int, rows_per_team_ext); @@ -188,8 +189,10 @@ void struct_matvec(const int stencil_type, << ", vector_length=" << vector_length << std::endl; } - spmv_struct.compute_interior(worksets_int, team_size_int, vector_length); - spmv_struct.compute_exterior(worksets_ext, team_size_ext, vector_length); + spmv_struct.compute_interior(execution_space{}, worksets_int, team_size_int, + vector_length); + spmv_struct.compute_exterior(execution_space{}, worksets_ext, team_size_ext, + vector_length); } // struct_matvec @@ -210,8 +213,9 @@ void matvec(typename YVector::const_value_type& alpha, const AMatrix& A, A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length); int64_t worksets = (y.extent(0) + rows_per_team - 1) / rows_per_team; - KokkosSparse::Impl::SPMV_Functor func( - alpha, A, x, beta, y, rows_per_team); + KokkosSparse::Impl::SPMV_Functor + func(alpha, A, x, beta, y, rows_per_team); if (print_lp) { std::cout << "worksets=" << worksets << ", team_size=" << team_size diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 98c5db89df..28ef93b004 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -177,11 +177,11 @@ if [[ "$HOSTNAME" == caraway* ]]; then # Warning: very generic name fi if [[ "$HOSTNAME" == fat* ]]; then # Caraway MI250 queues - MACHINE=caraway + MACHINE=vega90a_caraway fi if [[ "$HOSTNAME" == lean* ]]; then # Caraway MI210 queues - MACHINE=caraway + MACHINE=vega90a_caraway fi if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then @@ -698,8 +698,39 @@ elif [ "$MACHINE" = "caraway" ]; then if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=VEGA908" fi +elif [ "$MACHINE" = "vega90a_caraway" ]; then + SKIP_HWLOC=True + # BUILD_ONLY=True + # report_and_log_test_result: only testing compilation of code for now, + # output description and success based only on build succes; build time output (no run-time) + + BASE_MODULE_LIST="cmake,/" + 
ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" + + HIPCLANG_BUILD_LIST="Hip_Serial" + HIPCLANG_WARNING_FLAGS="" + + if [ "$SPOT_CHECK_TPLS" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/5.6.0 $ROCM520_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/5.6.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "rocm/5.6.1 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + ) + fi + + + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=VEGA90A" + fi elif [ "$MACHINE" = "blake" ]; then - MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 4e185f4c1b..714461bfe6 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,4 +1,4 @@ -ARG BASE=nvidia/cuda:11.7.0-devel-ubuntu22.04 +ARG BASE=nvidia/cuda:11.7.1-devel-ubuntu22.04 FROM $BASE RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub diff --git a/sparse/impl/KokkosSparse_crs_detect_block_size.hpp b/sparse/impl/KokkosSparse_crs_detect_block_size.hpp new file mode 100644 index 0000000000..418f2a74cc --- /dev/null +++ b/sparse/impl/KokkosSparse_crs_detect_block_size.hpp @@ -0,0 +1,158 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP +#define KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP + +#include + +#include +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_Utils.hpp" + +/*! \file KokkosSparse_crs_detect_block_size.hpp + + \brief A utility function for detecting the block size in a CrsMatrix. Not + for performance-sensitive use. +*/ + +namespace KokkosSparse::Impl { + +/** + * \class BlockPopulations + * \brief A class to store population counts of blocks in a CrsMatrix + */ +class BlockPopulations { + public: + /** + * \brief Constructor for BlockPopulations + * \param sz The block size + */ + BlockPopulations(size_t sz) : sz_(sz) {} + + /** + * \brief Add a point to the corresponding block + * \param r The row index of the point + * \param c The column index of the point + */ + void add(size_t r, size_t c) { + auto key = std::make_pair(r / sz_, c / sz_); + auto it = blocks_.find(key); + if (it == blocks_.end()) { + blocks_.insert(std::make_pair(key, 1)); + } else { + ++(it->second); + } + } + + /** + * \brief Check if all blocks are dense + * \return True if all blocks have a count equal to the block size squared + */ + bool all_dense() const { + for (const auto &kv : blocks_) { + if (kv.second < sz_ * sz_) { + return false; + } + } + return true; + } + + private: + std::map, size_t> + blocks_; /**< A map of block coordinates to their population counts */ + size_t sz_; /**< The block size */ +}; + +/** + * @brief Detects the largest block size that yields only dense blocks in a + CrsMatrix + * + * @tparam Crs 
The type of the CRS matrix. + * @param crs The CRS matrix to detect the block size for. + * @return The largest block size that results in completely dense blocks + The smallest valid block size is 1 + Since blocks must be dense, sqrt(nnz), num rows, num cols, and min nnz/row + among non-empty rows are all easy upper bounds of the block size. + Block sizes are tested from 1 to the minimum of the above. + The matrix dimensions must divide evenly into a trial block size (otherwise a + block would not be full). Furthermore, if a block size of N is not dense, any + multiple of N will also not be dense, and can be skipped. This is because + blocks of 2N contain blocks of N, at least one of which is already known not to + be dense. In practice, this ends up testing only small composite factors and + all prime factors up to the upper bound. +*/ +template +size_t detect_block_size(const Crs &crs) { + using ordinal_type = typename Crs::ordinal_type; + + // copy matrix data to host + auto rs = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + crs.graph.row_map); + auto cs = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + crs.graph.entries); + + // upper bound is minimum of sqrt(nnz), numRows, numCols, + // and smallest non-empty row + size_t upperBound = std::sqrt(double(crs.nnz())); + upperBound = std::min(upperBound, size_t(crs.numRows())); + upperBound = std::min(upperBound, size_t(crs.numCols())); + for (size_t i = 1; i < rs.size(); ++i) { + size_t rowLen = rs(i) - rs(i - 1); + if (rowLen > 0) { + upperBound = std::min(upperBound, rowLen); + } + } + + // trial blocks sizes that didn't work out + std::vector rejectedSizes; + + size_t largestBlockSize = 1; // always a valid block size + for (size_t trialSize = 2; trialSize <= upperBound; ++trialSize) { + // trial size must be factor of rows / cols + if ((crs.numRows() % trialSize) || (crs.numCols() % trialSize)) { + rejectedSizes.push_back(trialSize); + continue; + } + + // trial size must not be a 
multiple of previously-rejected size + if (std::any_of(rejectedSizes.begin(), rejectedSizes.end(), + [&](size_t f) { return trialSize % f == 0; })) { + rejectedSizes.push_back(trialSize); + continue; + } + + // count the population of all blocks + BlockPopulations pops(trialSize); + for (ordinal_type row = 0; row < crs.numRows(); ++row) { + for (size_t ci = rs(row); ci < rs(row + 1); ++ci) { + ordinal_type col = cs(ci); + pops.add(row, col); + } + } + + // if all blocks are dense, this is the largest one so far + if (pops.all_dense()) { + largestBlockSize = trialSize; + } else { + rejectedSizes.push_back(trialSize); + } + } + return largestBlockSize; +} + +} // namespace KokkosSparse::Impl + +#endif // KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP diff --git a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp index 8e4c187b99..7f1ff2171e 100644 --- a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp +++ b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp @@ -23,7 +23,21 @@ Bsr expand_crs_to_bsr(const Crs &crs, size_t blockSize) { using crs_row_map_type = typename Crs::row_map_type; using bsr_row_map_type = Kokkos::View; + bsr_device_type>; // need non-const version + + using bsr_size_type = typename Bsr::non_const_size_type; + + { + size_t nnz = crs.nnz() * blockSize * blockSize; + if (nnz > size_t(Kokkos::ArithTraits::max())) { + std::stringstream ss; + ss << "expanding " << crs.nnz() + << " non-zeros of CrsMatrix into blocks of " << blockSize + << " would overflow size_type of requested BsrMatrix " + << Kokkos::ArithTraits::name(); + throw std::runtime_error(ss.str()); + } + } // construct the Bsr row map bsr_row_map_type bsrRowMap("bsrRowMap", crs.graph.row_map.size()); diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index e4cfb4b047..7391e00e3d 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -84,7 +84,7 @@ 
class PointGaussSeidel { typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; - typedef Kokkos::RangePolicy range_pol; + typedef Kokkos::RangePolicy range_policy_t; typedef typename HandleType::GraphColoringHandleType::color_view_t color_view_t; typedef typename HandleType::GraphColoringHandleType::color_t color_t; @@ -825,6 +825,8 @@ class PointGaussSeidel { void initialize_symbolic() { auto gsHandle = get_gs_handle(); const size_type longRowThreshold = gsHandle->get_long_row_threshold(); + const MyExecSpace my_exec_space = gsHandle->get_execution_space(); + const int num_streams = gsHandle->get_num_streams(); // Validate settings if (gsHandle->get_block_size() > 1 && longRowThreshold > 0) @@ -838,6 +840,7 @@ class PointGaussSeidel { #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif + // TODO: Pass my_exec_space into KokkosGraph kernels typename HandleType::GraphColoringHandleType::color_view_t colors; color_t numColors; { @@ -871,6 +874,9 @@ class PointGaussSeidel { colors = gchandle->get_vertex_colors(); numColors = gchandle->get_num_colors(); } + // Wait for coloring to finish on its stream + using ColoringExecSpace = typename HandleType::HandleExecSpace; + ColoringExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "COLORING_TIME:" << timer.seconds() << std::endl; timer.reset(); @@ -886,7 +892,8 @@ class PointGaussSeidel { for (int i = 0; i < num_rows; ++i) { h_colors(i) = i + 1; } - Kokkos::deep_copy(colors, h_colors); + Kokkos::deep_copy(my_exec_space, colors, h_colors); + my_exec_space.fence(); #endif nnz_lno_persistent_work_view_t color_xadj; nnz_lno_persistent_work_view_t color_adj; @@ -896,10 +903,10 @@ class PointGaussSeidel { KokkosKernels::Impl::create_reverse_map< typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, numColors, colors, color_xadj, color_adj); + my_exec_space, num_rows, numColors, colors, color_xadj, 
color_adj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -909,7 +916,7 @@ class PointGaussSeidel { Kokkos::deep_copy(h_color_xadj, color_xadj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "DEEP_COPY:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -917,11 +924,11 @@ class PointGaussSeidel { // Count long rows per color set, and sort color sets so that long rows // come after regular rows nnz_lno_persistent_work_view_t long_rows_per_color( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "long_rows_per_color"), numColors); nnz_lno_persistent_work_view_t max_row_length_per_color( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "max_row_length_per_color"), numColors); nnz_lno_t mostLongRowsInColor = 0; @@ -930,33 +937,38 @@ class PointGaussSeidel { max_row_length_per_color); int sortLongRowsTeamSize = 1; { - team_policy_t temp(1, 1); + team_policy_t temp(my_exec_space, 1, 1); sortLongRowsTeamSize = temp.team_size_recommended( sortIntoLongRowsFunctor, Kokkos::ParallelReduceTag()); } - Kokkos::parallel_reduce(team_policy_t(numColors, sortLongRowsTeamSize), - sortIntoLongRowsFunctor, - Kokkos::Max(mostLongRowsInColor)); + Kokkos::parallel_reduce( + team_policy_t(my_exec_space, numColors, sortLongRowsTeamSize), + sortIntoLongRowsFunctor, Kokkos::Max(mostLongRowsInColor)); auto host_long_rows_per_color = Kokkos::create_mirror_view(long_rows_per_color); - Kokkos::deep_copy(host_long_rows_per_color, long_rows_per_color); + Kokkos::deep_copy(my_exec_space, host_long_rows_per_color, + long_rows_per_color); + my_exec_space.fence(); gsHandle->set_long_rows_per_color(host_long_rows_per_color); auto host_max_row_length_per_color = 
Kokkos::create_mirror_view(max_row_length_per_color); - Kokkos::deep_copy(host_max_row_length_per_color, + Kokkos::deep_copy(my_exec_space, host_max_row_length_per_color, max_row_length_per_color); + my_exec_space.fence(); gsHandle->set_max_row_length_per_color(host_max_row_length_per_color); scalar_persistent_work_view_t long_row_x( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "long_row_x"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "long_row_x"), mostLongRowsInColor); gsHandle->set_long_row_x(long_row_x); } else { // Just sort rows by ID. KokkosSparse::sort_crs_graph(color_xadj, color_adj); + decltype(color_adj)>(my_exec_space, + color_xadj, color_adj); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "SORT_TIME:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -968,29 +980,29 @@ class PointGaussSeidel { Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), create_permuted_xadj(color_adj, xadj, permuted_xadj, old_to_new_map)); // std::cout << "create_permuted_xadj" << std::endl; #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "CREATE_PERMUTED_XADJ:" << timer.seconds() << std::endl; timer.reset(); #endif KokkosKernels::Impl::inclusive_parallel_prefix_sum< - row_lno_persistent_work_view_t, MyExecSpace>(num_rows + 1, - permuted_xadj); + row_lno_persistent_work_view_t, MyExecSpace>( + my_exec_space, num_rows + 1, permuted_xadj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "INCLUSIVE_PPS:" << timer.seconds() << std::endl; timer.reset(); #endif Kokkos::parallel_for("KokkosSparse::PointGaussSeidel::fill_matrix_symbolic", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), fill_matrix_symbolic(num_rows, color_adj, xadj, adj, // adj_vals, permuted_xadj, 
permuted_adj, @@ -998,7 +1010,7 @@ class PointGaussSeidel { old_to_new_map)); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "SYMBOLIC_FILL:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -1012,8 +1024,8 @@ class PointGaussSeidel { // first calculate max row size. size_type max_row_size = 0; KokkosKernels::Impl::kk_view_reduce_max_row_size( - num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, - max_row_size); + my_exec_space, num_rows, permuted_xadj.data(), + permuted_xadj.data() + 1, max_row_size); nnz_lno_t brows = permuted_xadj.extent(0) - 1; size_type bnnz = permuted_adj.extent(0) * block_size * block_size; @@ -1079,10 +1091,11 @@ class PointGaussSeidel { size_type num_large_rows = 0; KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold< row_lno_persistent_work_view_t, MyExecSpace>( - brows, permuted_xadj, num_values_in_l1, num_large_rows); + my_exec_space, brows, permuted_xadj, num_values_in_l1, + num_large_rows); num_big_rows = KOKKOSKERNELS_MACRO_MIN( num_large_rows, - (size_type)(MyExecSpace().concurrency() / suggested_vector_size)); + (size_type)(my_exec_space.concurrency() / suggested_vector_size)); // std::cout << "num_big_rows:" << num_big_rows << std::endl; if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { @@ -1091,8 +1104,8 @@ class PointGaussSeidel { size_t free_byte; size_t total_byte; KokkosKernels::Impl::kk_get_free_total_memory< - typename pool_memory_space::memory_space>(free_byte, - total_byte); + typename pool_memory_space::memory_space>(free_byte, total_byte, + num_streams); size_t required_size = size_t(num_big_rows) * level_2_mem; if (required_size + num_big_rows * sizeof(int) > free_byte) { num_big_rows = @@ -1125,6 +1138,7 @@ class PointGaussSeidel { gsHandle->set_new_adj(permuted_adj); gsHandle->set_old_to_new_map(old_to_new_map); gsHandle->set_call_symbolic(true); + my_exec_space.fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "ALLOC:" << 
timer.seconds() << std::endl; #endif @@ -1335,7 +1349,14 @@ class PointGaussSeidel { if (gsHandle->is_symbolic_called() == false) { this->initialize_symbolic(); } - // else + + // Check settings + if (gsHandle->get_block_size() > 1 && + format != KokkosSparse::SparseMatrixFormat::BSR) + throw std::runtime_error( + "PointGaussSeidel block size > 1 but format is not " + "KokkosSparse::SparseMatrixFormat::BSR.\n"); + // else #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif @@ -1343,6 +1364,7 @@ class PointGaussSeidel { const_lno_row_view_t xadj = this->row_map; const_lno_nnz_view_t adj = this->entries; const_scalar_nnz_view_t adj_vals = this->values; + MyExecSpace my_exec_space = gsHandle->get_execution_space(); size_type nnz = adj_vals.extent(0); @@ -1353,14 +1375,16 @@ class PointGaussSeidel { nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); scalar_persistent_work_view_t permuted_adj_vals( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "newvals_"), nnz); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "newvals_"), + nnz); int suggested_vector_size = this->handle->get_suggested_vector_size(num_rows, nnz); int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t rows_per_team = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), num_rows); + suggested_team_size, my_exec_space.concurrency(), num_rows); nnz_lno_t block_size = gsHandle->get_block_size(); nnz_lno_t block_matrix_size = block_size * block_size; @@ -1384,7 +1408,8 @@ class PointGaussSeidel { if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), fill_matrix_numeric(color_adj, xadj, // adj, @@ -1396,7 +1421,7 @@ class 
PointGaussSeidel { block_matrix_size)); } else { Kokkos::parallel_for("KokkosSparse::GaussSeidel::fill_matrix_numeric", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), fill_matrix_numeric(color_adj, xadj, // adj, adj_vals, newxadj_, @@ -1409,7 +1434,7 @@ class PointGaussSeidel { gsHandle->set_new_adj_val(permuted_adj_vals); scalar_persistent_work_view_t permuted_inverse_diagonal( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "permuted_inverse_diagonal"), num_rows * block_size); if (!have_diagonal_given) { @@ -1421,13 +1446,14 @@ class PointGaussSeidel { block_size > 1) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::team_get_matrix_diagonals", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd); } else { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::get_matrix_diagonals", - range_pol(0, num_rows), gmd); + range_policy_t(my_exec_space, 0, num_rows), gmd); } } else { @@ -1435,21 +1461,20 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_block_vector< const_scalar_nnz_view_t, scalar_persistent_work_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, block_size, old_to_new_map, given_inverse_diagonal, - permuted_inverse_diagonal); + my_exec_space, num_rows, block_size, old_to_new_map, + given_inverse_diagonal, permuted_inverse_diagonal); else KokkosKernels::Impl::permute_vector< const_scalar_nnz_view_t, scalar_persistent_work_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, old_to_new_map, given_inverse_diagonal, + my_exec_space, num_rows, old_to_new_map, given_inverse_diagonal, permuted_inverse_diagonal); } - gsHandle->set_permuted_inverse_diagonal(permuted_inverse_diagonal); gsHandle->set_call_numeric(true); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + 
my_exec_space.fence(); std::cout << "NUMERIC:" << timer.seconds() << std::endl; #endif } @@ -1511,24 +1536,25 @@ class PointGaussSeidel { scalar_persistent_work_view_t permuted_inverse_diagonal = gsHandle->get_permuted_inverse_diagonal(); - color_t numColors = gsHandle->get_num_colors(); + color_t numColors = gsHandle->get_num_colors(); + auto my_exec_space = gsHandle->get_execution_space(); if (update_y_vector) { KokkosKernels::Impl::permute_block_vector< y_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, block_size, old_to_new_map, y_rhs_input_vec, + my_exec_space, num_rows, block_size, old_to_new_map, y_rhs_input_vec, Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector(num_cols * block_size, - Permuted_Xvector); + KokkosKernels::Impl::zero_vector< + MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + my_exec_space, num_cols * block_size, Permuted_Xvector); } else { KokkosKernels::Impl::permute_block_vector< x_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, block_size, old_to_new_map, x_lhs_output_vec, + my_exec_space, num_cols, block_size, old_to_new_map, x_lhs_output_vec, Permuted_Xvector); } @@ -1561,7 +1587,7 @@ class PointGaussSeidel { int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), brows); + suggested_team_size, my_exec_space.concurrency(), brows); // size_t shmem_size_to_use = this->handle->get_shmem_size(); size_t l1_shmem_size = gsHandle->get_level_1_mem(); @@ -1591,13 +1617,11 @@ class PointGaussSeidel { this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, apply_backward); - // Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, - // Permuted_Xvector, color_adj)); - 
KokkosKernels::Impl::permute_block_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, block_size, color_adj, Permuted_Xvector, x_lhs_output_vec); + my_exec_space, num_cols, block_size, color_adj, Permuted_Xvector, + x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); @@ -1615,7 +1639,8 @@ class PointGaussSeidel { nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) { - auto gsHandle = get_gs_handle(); + auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); auto Permuted_Xvector = gsHandle->get_permuted_x_vector(); auto Permuted_Yvector = gsHandle->get_permuted_y_vector(); @@ -1635,16 +1660,19 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_vector< y_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, old_to_new_map, y_rhs_input_vec, Permuted_Yvector); + my_exec_space, num_rows, old_to_new_map, y_rhs_input_vec, + Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector(num_cols, Permuted_Xvector); + KokkosKernels::Impl::zero_vector< + MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + my_exec_space, num_cols, Permuted_Xvector); } else { KokkosKernels::Impl::permute_vector< x_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, old_to_new_map, x_lhs_output_vec, Permuted_Xvector); + my_exec_space, num_cols, old_to_new_map, x_lhs_output_vec, + Permuted_Xvector); } #if KOKKOSSPARSE_IMPL_PRINTDEBUG @@ -1673,14 +1701,12 @@ class PointGaussSeidel { apply_backward); } - // Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, - // Permuted_Xvector, color_adj)); - KokkosKernels::Impl::permute_vector< scalar_persistent_work_view2d_t, 
x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); + my_exec_space, num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG + Kokkos::fence(); std::cout << "--point After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); std::cout << "--point Result X:"; @@ -1699,6 +1725,14 @@ class PointGaussSeidel { if (gsHandle->is_numeric_called() == false) { this->initialize_numeric(); } + + // Check settings + if (gsHandle->get_block_size() > 1 && + format != KokkosSparse::SparseMatrixFormat::BSR) + throw std::runtime_error( + "PointGaussSeidel block size > 1 but format is not " + "KokkosSparse::SparseMatrixFormat::BSR.\n"); + // make sure x and y have been allocated with the correct dimensions nnz_lno_t block_size = gsHandle->get_block_size(); gsHandle->allocate_x_y_vectors(this->num_rows * block_size, @@ -1719,7 +1753,8 @@ class PointGaussSeidel { nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { - auto gsHandle = this->get_gs_handle(); + auto gsHandle = this->get_gs_handle(); + MyExecSpace my_exec_space = gsHandle->get_execution_space(); nnz_lno_persistent_work_host_view_t long_rows_per_color; nnz_lno_persistent_work_host_view_t max_row_length_per_color; scalar_persistent_work_view_t long_row_x; @@ -1730,7 +1765,7 @@ class PointGaussSeidel { max_row_length_per_color = gsHandle->get_max_row_length_per_color(); long_row_x = gsHandle->get_long_row_x(); haveLongRows = true; - longrow_apply_team_policy_t tempPolicy(1, 1); + longrow_apply_team_policy_t tempPolicy(my_exec_space, 1, 1); longRowTeamSize = tempPolicy.team_size_recommended(gs, Kokkos::ParallelForTag()); } @@ -1778,25 +1813,34 @@ class PointGaussSeidel { if (block_size == 1) { Kokkos::parallel_for( labelRegular, - team_policy_t((numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + 
Kokkos::Experimental::require( + team_policy_t(my_exec_space, + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else if (gs.num_max_vals_in_l2 == 0) { Kokkos::parallel_for( labelBlock, - block_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + block_apply_team_policy_t( + my_exec_space, + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else { Kokkos::parallel_for( labelBigBlock, - bigblock_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + bigblock_apply_team_policy_t( + my_exec_space, + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } } @@ -1815,15 +1859,22 @@ class PointGaussSeidel { auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; - Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); Kokkos::parallel_for( labelLong, - longrow_apply_team_policy_t(numLongRows * teams_per_row, - longRowTeamSize), + Kokkos::Experimental::require( + longrow_apply_team_policy_t(my_exec_space, + numLongRows * teams_per_row, + longRowTeamSize), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::require( + range_policy_t(my_exec_space, + color_index_end - numLongRows, + color_index_end), + 
Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); @@ -1838,7 +1889,8 @@ class PointGaussSeidel { nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { - auto gsHandle = this->get_gs_handle(); + auto gsHandle = this->get_gs_handle(); + MyExecSpace my_exec_space = gsHandle->get_execution_space(); nnz_lno_persistent_work_host_view_t long_rows_per_color; nnz_lno_persistent_work_host_view_t max_row_length_per_color; scalar_persistent_work_view_t long_row_x; @@ -1874,7 +1926,10 @@ class PointGaussSeidel { if (numRegularRows) { Kokkos::parallel_for( labelShort, - range_pol(color_index_begin, color_index_end - numLongRows), + Kokkos::Experimental::require( + range_policy_t(my_exec_space, color_index_begin, + color_index_end - numLongRows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } if (numLongRows) { @@ -1889,14 +1944,21 @@ class PointGaussSeidel { auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; - Kokkos::deep_copy(long_row_x, nnz_scalar_t()); - Kokkos::parallel_for(labelLong, - Kokkos::RangePolicy( - 0, numLongRows * par_per_row), - gs); + Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); + Kokkos::parallel_for( + labelLong, + Kokkos::Experimental::require( + Kokkos::RangePolicy( + my_exec_space, 0, numLongRows * par_per_row), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + Kokkos::Experimental::require( + range_policy_t(my_exec_space, + color_index_end - numLongRows, + color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - 
numLongRows)); diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index f04ae34fc9..840ced73b8 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -120,41 +120,47 @@ struct gauss_seidel_apply_eti_spec_avail { namespace KokkosSparse { namespace Impl { -template ::value, bool eti_spec_avail = gauss_seidel_symbolic_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t>::value> struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, bool is_graph_symmetric); }; template < - class KernelHandle, KokkosSparse::SparseMatrixFormat format, - class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t, + class ExecSpaceIn, class KernelHandle, + KokkosSparse::SparseMatrixFormat format, class a_size_view_t_, + class a_lno_view_t, class a_scalar_view_t, bool tpl_spec_avail = gauss_seidel_numeric_tpl_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value, bool eti_spec_avail = gauss_seidel_numeric_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value> struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric); static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename 
KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric); }; -template ::value, @@ -163,7 +169,8 @@ template ::value> struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -174,15 +181,19 @@ struct GAUSS_SEIDEL_APPLY { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct GAUSS_SEIDEL_SYMBOLIC +struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t_ entries, bool is_graph_symmetric) { Kokkos::Profiling::pushRegion("KokkosSparse::Impl::gauss_seidel_symbolic"); auto gsHandle = handle->get_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel< KernelHandle, a_size_view_t_, a_lno_view_t_, @@ -206,17 +217,20 @@ struct GAUSS_SEIDEL_SYMBOLIC -struct GAUSS_SEIDEL_NUMERIC +struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t 
values, bool is_graph_symmetric) { Kokkos::Profiling::pushRegion("KokkosSparse::Impl::gauss_seidel_numeric"); auto gsHandle = handle->get_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidelget_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel -struct GAUSS_SEIDEL_APPLY { +template +struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -289,6 +308,7 @@ struct GAUSS_SEIDEL_APPLYget_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel, \ @@ -337,6 +358,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -352,6 +374,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -367,6 +390,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ extern template struct GAUSS_SEIDEL_NUMERIC< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -386,6 +410,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -401,6 +426,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ template struct GAUSS_SEIDEL_NUMERIC< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -420,6 +446,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -441,6 +468,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ extern 
template struct GAUSS_SEIDEL_APPLY< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -466,6 +494,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -487,6 +516,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ template struct GAUSS_SEIDEL_APPLY< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index d8754e591c..3adb42454b 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -18,6 +18,9 @@ #define KOKKOSSPARSE_MDF_IMPL_HPP_ #include +#include +#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_findRelOffset.hpp" #include #include "Kokkos_ArithTraits.hpp" @@ -40,31 +43,45 @@ struct MDF_count_lower { entries_type::non_const_type; using size_type = typename crs_matrix_type::ordinal_type; using value_type = typename crs_matrix_type::size_type; + using KAV = typename Kokkos::ArithTraits; crs_matrix_type A; col_ind_type permutation; col_ind_type permutation_inv; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + MDF_count_lower(crs_matrix_type A_, col_ind_type permutation_, col_ind_type permutation_inv_) : A(A_), permutation(permutation_), permutation_inv(permutation_inv_){}; KOKKOS_INLINE_FUNCTION - void operator()(const size_type rowIdx, value_type& update) const { - permutation(rowIdx) = rowIdx; - permutation_inv(rowIdx) = rowIdx; - for (value_type entryIdx = A.graph.row_map(rowIdx); - entryIdx < A.graph.row_map(rowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) <= rowIdx) { - update += 1; - } - } + void operator()(const 
team_member_t team, value_type& update) const { + const auto rowIdx = team.league_rank(); + const auto rowView = A.graph.rowConst(rowIdx); + + value_type local_contrib = KAV::zero(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const size_type entryIdx, value_type& partial) { + if (rowView(entryIdx) <= rowIdx) partial += 1; + }, + Kokkos::Sum(local_contrib)); + + Kokkos::single(Kokkos::PerTeam(team), [&] { + permutation(rowIdx) = rowIdx; + permutation_inv(rowIdx) = rowIdx; + update += local_contrib; + }); } - }; // MDF_count_lower -template +template struct MDF_discarded_fill_norm { + using device_type = typename crs_matrix_type::device_type; + using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = typename static_crs_graph_type::entries_type::non_const_type; @@ -76,10 +93,14 @@ struct MDF_discarded_fill_norm { using KAS = typename Kokkos::ArithTraits; using scalar_mag_type = typename KAS::mag_type; using KAM = typename Kokkos::ArithTraits; + using permutation_set_type = + Kokkos::UnorderedMap; crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; + permutation_set_type permutation_set; + col_ind_type update_list; values_mag_type discarded_fill; col_ind_type deficiency; @@ -88,240 +109,146 @@ struct MDF_discarded_fill_norm { MDF_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, ordinal_type factorization_step_, col_ind_type permutation_, + permutation_set_type permutation_set_, values_mag_type discarded_fill_, - col_ind_type deficiency_, int verbosity_) + col_ind_type deficiency_, int verbosity_, + col_ind_type update_list_ = col_ind_type{}) : A(A_), At(At_), factorization_step(factorization_step_), permutation(permutation_), + permutation_set(permutation_set_), + update_list(update_list_), discarded_fill(discarded_fill_), deficiency(deficiency_), verbosity(verbosity_){}; - KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type i) const { - 
ordinal_type rowIdx = permutation(i); - scalar_mag_type discard_norm = KAM::zero(); - scalar_type diag_val = KAS::zero(); - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; - for (size_type alphaIdx = At.graph.row_map(rowIdx); - alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } - - if (fillRowIdx != rowIdx && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(rowIdx); - betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + + struct DiscNormReducer { + using reducer = DiscNormReducer; + struct value_type { + scalar_mag_type discarded_norm; + ordinal_type numFillEntries; + scalar_type diag_val; + }; + using result_view_type = Kokkos::View; + + private: + result_view_type value; + + public: + KOKKOS_INLINE_FUNCTION + DiscNormReducer(value_type& value_) : value(&value_) {} + + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest.discarded_norm += src.discarded_norm; + dest.numFillEntries += src.numFillEntries; + if (dest.diag_val == KAS::zero()) dest.diag_val = src.diag_val; + } - if (fillColIdx != rowIdx && col_not_eliminated) { - entryIsDiscarded = true; - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - entryIsDiscarded = 
false; - } - } - if (entryIsDiscarded) { - numFillEntries += 1; - discard_norm += - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - int(At.graph.entries(alphaIdx)), - int(A.graph.entries(betaIdx)), - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), - int(rowIdx)); - } - } - } - } - } - } else if (fillRowIdx == rowIdx) { - diag_val = At.values(alphaIdx); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value detected, values(%d)=%f\n", int(rowIdx), - int(alphaIdx), At.values(alphaIdx)); - } else if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value detected, |values(%d)|=%f\n", - int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); - } - } - } + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.discarded_norm = Kokkos::reduction_identity::sum(); + val.numFillEntries = Kokkos::reduction_identity::sum(); + val.diag_val = KAS::zero(); } - // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / KAS::abs(diag_val * diag_val); - discarded_fill(rowIdx) = discard_norm; - deficiency(rowIdx) = numFillEntries; - - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAM::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); - } + KOKKOS_INLINE_FUNCTION + static value_type init() { + value_type out; + init(out); + return out; } - } -}; // MDF_discarded_fill_norm + KOKKOS_INLINE_FUNCTION + 
value_type& reference() const { return *value.data(); } -template -struct MDF_selective_discarded_fill_norm { - using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; - using col_ind_type = - typename static_crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; - using KAS = typename Kokkos::ArithTraits; - using scalar_mag_type = typename KAS::mag_type; - using KAM = typename Kokkos::ArithTraits; - using values_mag_type = typename MDF_types::values_mag_type; + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + }; - crs_matrix_type A, At; - ordinal_type factorization_step; - col_ind_type permutation; - col_ind_type update_list; + KOKKOS_INLINE_FUNCTION + void operator()(team_member_t team) const { + const ordinal_type rowIdx = + is_initial_fill ? 
permutation(team.league_rank()) + : permutation(update_list(team.league_rank())); + const auto colView = At.rowConst(rowIdx); + const auto rowView = A.rowConst(rowIdx); + + using reduction_val_t = typename DiscNormReducer::value_type; + reduction_val_t reduction_val = DiscNormReducer::init(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, reduction_val_t& running_disc_norm) { + const ordinal_type fillRowIdx = colView.colidx(alpha); + + // Record diagonal term + if (fillRowIdx == rowIdx) { + Kokkos::single(Kokkos::PerThread(team), [&] { + running_disc_norm.diag_val = colView.value(alpha); + }); + return; + } - values_mag_type discarded_fill; - col_ind_type deficiency; - int verbosity; + // Check if row already eliminated + if constexpr (!is_initial_fill) { + if (permutation_set.exists(fillRowIdx)) return; + } - MDF_selective_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, - ordinal_type factorization_step_, - col_ind_type permutation_, - col_ind_type update_list_, - values_mag_type discarded_fill_, - col_ind_type deficiency_, int verbosity_) - : A(A_), - At(At_), - factorization_step(factorization_step_), - permutation(permutation_), - update_list(update_list_), - discarded_fill(discarded_fill_), - deficiency(deficiency_), - verbosity(verbosity_){}; + const auto fillRowView = A.rowConst(fillRowIdx); + reduction_val_t local_reduction_val = DiscNormReducer::init(); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, rowView.length), + [&](const ordinal_type beta, + reduction_val_t& vect_running_disc_norm) { + const ordinal_type fillColIdx = rowView.colidx(beta); - KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type i) const { - ordinal_type rowIdx = permutation(update_list(i)); - scalar_mag_type discard_norm = KAM::zero(); - scalar_type diag_val = KAS::zero(); - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; - for (size_type alphaIdx = At.graph.row_map(rowIdx); - 
alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } - - if (fillRowIdx != rowIdx && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(rowIdx); - betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } + if (fillColIdx == rowIdx) return; - if (fillColIdx != rowIdx && col_not_eliminated) { - entryIsDiscarded = true; - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - entryIsDiscarded = false; - } - } - if (entryIsDiscarded) { - numFillEntries += 1; - discard_norm += - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - static_cast(At.graph.entries(alphaIdx)), - static_cast(A.graph.entries(betaIdx)), - static_cast( - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), - static_cast(rowIdx)); + if constexpr (!is_initial_fill) { + if (permutation_set.exists(fillColIdx)) return; } - } - } - } - } - } else if (fillRowIdx == rowIdx) { - diag_val = At.values(alphaIdx); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, values(%d)=%f\n", - static_cast(rowIdx), static_cast(alphaIdx), - static_cast(At.values(alphaIdx))); - } 
else if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, |values(%d)|=%f\n", - static_cast(rowIdx), static_cast(alphaIdx), - static_cast(KAS::abs(At.values(alphaIdx)))); - } - } - } - } - // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / KAS::abs(diag_val * diag_val); - discarded_fill(rowIdx) = discard_norm; - deficiency(rowIdx) = numFillEntries; - - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAM::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); - } - } + bool entryIsDiscarded = true; + for (ordinal_type gamma = 0; gamma < fillRowView.length; + ++gamma) { + if (fillRowView.colidx(gamma) == fillColIdx) { + entryIsDiscarded = false; + } + } + if (entryIsDiscarded) { + vect_running_disc_norm.numFillEntries += 1; + vect_running_disc_norm.discarded_norm += + KAS::abs(colView.value(alpha) * rowView.value(beta)) * + KAS::abs(colView.value(alpha) * rowView.value(beta)); + } + }, + DiscNormReducer(local_reduction_val)); + + Kokkos::single(Kokkos::PerThread(team), [&] { + running_disc_norm.discarded_norm += + local_reduction_val.discarded_norm; + running_disc_norm.numFillEntries += + local_reduction_val.numFillEntries; + }); + }, + DiscNormReducer(reduction_val)); + + Kokkos::single(Kokkos::PerTeam(team), [&] { + const scalar_mag_type& discard_norm = reduction_val.discarded_norm; + const ordinal_type& numFillEntries = reduction_val.numFillEntries; + const scalar_type& diag_val = reduction_val.diag_val; + + // TODO add a check on `diag_val == zero` + discarded_fill(rowIdx) = discard_norm / KAS::abs(diag_val * diag_val); + deficiency(rowIdx) = numFillEntries; + }); } - -}; // MDF_selective_discarded_fill_norm +}; 
// MDF_discarded_fill_norm template struct MDF_select_row { @@ -429,14 +356,25 @@ struct MDF_select_row { } KOKKOS_INLINE_FUNCTION - void init(value_type& dst) const { - dst = Kokkos::ArithTraits::zero(); - } + void init(value_type& dst) const { dst = factorization_step; } }; // MDF_select_row +template +KOKKOS_INLINE_FUNCTION bool sorted_view_contains( + const view_type& values, const ordinal_type size, + typename view_type::const_value_type search_val) { + return KokkosSparse::findRelOffset(values, size, search_val, size, true) != + size; +} + template struct MDF_factorize_row { + using device_type = typename crs_matrix_type::device_type; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -447,6 +385,8 @@ struct MDF_factorize_row { using value_type = typename crs_matrix_type::value_type; using values_mag_type = typename MDF_types::values_mag_type; using value_mag_type = typename values_mag_type::value_type; + using permutation_set_type = + Kokkos::UnorderedMap; crs_matrix_type A, At; @@ -459,10 +399,13 @@ struct MDF_factorize_row { values_type valuesU; col_ind_type permutation, permutation_inv; + permutation_set_type permutation_set; values_mag_type discarded_fill; col_ind_type factored; ordinal_type selected_row_idx, factorization_step; + col_ind_type update_list; + int verbosity; MDF_factorize_row(crs_matrix_type A_, crs_matrix_type At_, @@ -470,9 +413,11 @@ struct MDF_factorize_row { values_type valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, col_ind_type permutation_inv_, + permutation_set_type permutation_set_, values_mag_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, - 
ordinal_type factorization_step_, int verbosity_) + ordinal_type factorization_step_, + col_ind_type& update_list_, int verbosity_) : A(A_), At(At_), row_mapL(row_mapL_), @@ -483,277 +428,306 @@ struct MDF_factorize_row { valuesU(valuesU_), permutation(permutation_), permutation_inv(permutation_inv_), + permutation_set(permutation_set_), discarded_fill(discarded_fill_), factored(factored_), selected_row_idx(selected_row_idx_), factorization_step(factorization_step_), + update_list(update_list_), verbosity(verbosity_){}; + // Phase 2, do facrotization KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type /* idx */) const { - const ordinal_type selected_row = permutation(selected_row_idx); - discarded_fill(selected_row) = Kokkos::ArithTraits::max(); - - // Swap entries in permutation vectors - permutation(selected_row_idx) = permutation(factorization_step); - permutation(factorization_step) = selected_row; - permutation_inv(permutation(factorization_step)) = factorization_step; - permutation_inv(permutation(selected_row_idx)) = selected_row_idx; - - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); - for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(permutation(rowIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - - // Insert the upper part of the selected row in U - // including the diagonal term. 
- value_type diag = Kokkos::ArithTraits::zero(); - size_type U_entryIdx = row_mapU(factorization_step); - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - if (permutation_inv(A.graph.entries(entryIdx)) >= factorization_step) { - entriesU(U_entryIdx) = A.graph.entries(entryIdx); - valuesU(U_entryIdx) = A.values(entryIdx); - ++U_entryIdx; - if (A.graph.entries(entryIdx) == selected_row) { - diag = A.values(entryIdx); - } - } - } - row_mapU(factorization_step + 1) = U_entryIdx; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", - static_cast(selected_row), - static_cast(diag)); - } - - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); - for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; - ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(row_mapU(rowIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesU(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesU(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - - // Insert the lower part of the selected column of A - // divided by its the diagonal value to obtain a unit - // diagonal value in L. 
- size_type L_entryIdx = row_mapL(factorization_step); - entriesL(L_entryIdx) = selected_row; - valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); - ++L_entryIdx; - for (size_type entryIdx = At.graph.row_map(selected_row); - entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { - if (permutation_inv(At.graph.entries(entryIdx)) > factorization_step) { - entriesL(L_entryIdx) = At.graph.entries(entryIdx); - valuesL(L_entryIdx) = At.values(entryIdx) / diag; - ++L_entryIdx; - } - } - row_mapL(factorization_step + 1) = L_entryIdx; - - if constexpr (std::is_arithmetic_v) { - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", - static_cast(factorization_step), - static_cast(factorization_step), - static_cast(factorization_step + 1), - static_cast(row_mapL(factorization_step)), - static_cast(row_mapL(factorization_step + 1))); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesL(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesL(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - - // If this was the last row no need to update A and At! - if (factorization_step == A.numRows() - 1) { - return; - } - - // Finally we want to update A and At with the values - // that where not discarded during factorization. - // Note: this is almost the same operation as computing - // the norm of the discarded fill... 
- - // First step: find the diagonal entry in selected_row - value_type diag_val = Kokkos::ArithTraits::zero(); - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - ordinal_type colIdx = A.graph.entries(entryIdx); - if (selected_row == colIdx) { - diag_val = A.values(entryIdx); - } - } + void operator()(team_member_t team) const { + const auto alpha = team.league_rank(); + const ordinal_type selected_row = permutation(factorization_step); + const auto colView = At.rowConst(selected_row); + + const auto rowInd = colView.colidx(alpha); + if (rowInd == selected_row) return; + + if (permutation_set.exists(rowInd)) return; + + // Only one of the values will match selected so can just sum all contribs + const auto rowView = A.rowConst(selected_row); + value_type diag = Kokkos::ArithTraits::zero(); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, rowView.length), + [&](const size_type ind, value_type& running_diag) { + if (rowView.colidx(ind) == selected_row) + running_diag = rowView.value(ind); + }, + Kokkos::Sum(diag)); // Extract alpha and beta vectors // Then insert alpha*beta/diag_val if the corresponding // entry in A is non-zero. 
- for (size_type alphaIdx = At.graph.row_map(selected_row); - alphaIdx < At.graph.row_map(selected_row + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } - - if ((fillRowIdx != selected_row) && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(selected_row); - betaIdx < A.graph.row_map(selected_row + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } + auto fillRowView = A.row(rowInd); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const ordinal_type beta) { + const auto colInd = rowView.colidx(beta); - if ((fillColIdx != selected_row) && col_not_eliminated) { - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - A.values(entryIdx) -= - At.values(alphaIdx) * A.values(betaIdx) / diag_val; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "A[%d, %d] -= %f\n", static_cast(fillRowIdx), - static_cast(fillColIdx), - static_cast(At.values(alphaIdx) * - A.values(betaIdx) / diag_val)); - } - } - } - } + if (colInd == selected_row) return; - for (size_type entryIdx = At.graph.row_map(fillColIdx); - entryIdx < At.graph.row_map(fillColIdx + 1); ++entryIdx) { - if (At.graph.entries(entryIdx) == fillRowIdx) { - At.values(entryIdx) -= - At.values(alphaIdx) * A.values(betaIdx) / diag_val; - } - } - } - } - } - } + if (permutation_set.exists(colInd)) return; - factored(selected_row) = 1; - - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); - for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "%f ", static_cast(A.values(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); - for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "%f ", static_cast(At.values(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - } // operator() + const auto subVal = colView.value(alpha) * rowView.value(beta) / diag; -}; // MDF_factorize_row + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, fillRowView.length), + [&](const ordinal_type gamma) { + if (colInd == fillRowView.colidx(gamma)) { + Kokkos::atomic_sub(&fillRowView.value(gamma), subVal); + } + }); + + auto fillColView = At.row(colInd); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, fillColView.length), + [&](const ordinal_type delt) { + if (rowInd == fillColView.colidx(delt)) { + Kokkos::atomic_sub(&fillColView.value(delt), subVal); + } + }); + }); + } +}; template struct MDF_compute_list_length { + using device_type = typename crs_matrix_type::device_type; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + + using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: + row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using size_type = typename crs_matrix_type::size_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; + using value_type = typename crs_matrix_type::value_type; + using values_mag_type = typename 
MDF_types::values_mag_type; + using value_mag_type = typename values_mag_type::value_type; - ordinal_type selected_row_idx; - crs_matrix_type A; - crs_matrix_type At; - col_ind_type permutation; + using permutation_set_type = + Kokkos::UnorderedMap; + + crs_matrix_type A, At; + + row_map_type row_mapL; + col_ind_type entriesL; + values_type valuesL; + + row_map_type row_mapU; + col_ind_type entriesU; + values_type valuesU; + + col_ind_type permutation, permutation_inv; + permutation_set_type permutation_set; + values_mag_type discarded_fill; col_ind_type factored; - col_ind_type update_list_length; + ordinal_type selected_row_idx, factorization_step; + col_ind_type update_list; - MDF_compute_list_length(const ordinal_type rowIdx_, const crs_matrix_type& A_, - const crs_matrix_type& At_, - const col_ind_type& permutation_, - const col_ind_type factored_, - col_ind_type& update_list_length_, - col_ind_type& update_list_) - : selected_row_idx(rowIdx_), - A(A_), + int verbosity; + + MDF_compute_list_length( + crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, + col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, + col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, + col_ind_type permutation_inv_, permutation_set_type permutation_set_, + values_mag_type discarded_fill_, col_ind_type factored_, + ordinal_type selected_row_idx_, ordinal_type factorization_step_, + col_ind_type& update_list_, int verbosity_) + : A(A_), At(At_), + row_mapL(row_mapL_), + entriesL(entriesL_), + valuesL(valuesL_), + row_mapU(row_mapU_), + entriesU(entriesU_), + valuesU(valuesU_), permutation(permutation_), + permutation_inv(permutation_inv_), + permutation_set(permutation_set_), + discarded_fill(discarded_fill_), factored(factored_), - update_list_length(update_list_length_), - update_list(update_list_) {} + selected_row_idx(selected_row_idx_), + factorization_step(factorization_step_), + update_list(update_list_), + 
verbosity(verbosity_){}; + // Phase 1, update list length KOKKOS_INLINE_FUNCTION - void operator()(const size_type /*idx*/) const { - const ordinal_type selected_row = permutation(selected_row_idx); - - size_type updateIdx = 0; - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - if ((A.graph.entries(entryIdx) != selected_row) && - (factored(A.graph.entries(entryIdx)) != 1)) { - update_list(updateIdx) = A.graph.entries(entryIdx); - ++updateIdx; - } + void operator()(const team_member_t team, ordinal_type& update_list_len, + ordinal_type& selected_row_len) const { + ordinal_type selected_row = 0; + + size_type U_entryIdx = row_mapU(factorization_step); + size_type L_entryIdx = row_mapL(factorization_step); + + Kokkos::single(Kokkos::PerTeam(team), [&] { + selected_row = permutation(selected_row_idx); + discarded_fill(selected_row) = Kokkos::ArithTraits::max(); + + // Swap entries in permutation vectors + permutation(selected_row_idx) = permutation(factorization_step); + permutation(factorization_step) = selected_row; + permutation_inv(permutation(factorization_step)) = factorization_step; + permutation_inv(permutation(selected_row_idx)) = selected_row_idx; + + // Diagonal value of L + entriesL(L_entryIdx) = selected_row; + valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); + + // Insert into permutation set for later + const auto res = permutation_set.insert(selected_row); + (void)res; // avoid unused error + assert(res.success()); + }); + ++L_entryIdx; + + // Only one thread has the selected row + team.team_reduce(Kokkos::Max(selected_row)); + const auto rowView = A.rowConst(selected_row); + const auto colView = At.rowConst(selected_row); + + // Insert the upper part of the selected row in U + // including the diagonal term. 
+ ordinal_type updateIdx = 0; + value_type diag = Kokkos::ArithTraits::zero(); + { + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const size_type alpha, ordinal_type& running_update, + bool is_final) { + const auto colInd = rowView.colidx(alpha); + if ((colInd != selected_row) && (factored(colInd) != 1)) { + if (is_final) { + update_list(running_update) = colInd; + ++updateIdx; + } + ++running_update; + } + } + // ,updateIdx + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(updateIdx)); + + // Sort update list + KokkosKernels::TeamBitonicSort(&update_list(0), updateIdx, team); + } + { + size_type numEntrU = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const size_type alpha, size_type& running_nEntr, bool is_final) { + const auto colInd = rowView.colidx(alpha); + if (permutation_inv(colInd) >= factorization_step) { + if (is_final) { + ++numEntrU; + entriesU(U_entryIdx + running_nEntr) = colInd; + valuesU(U_entryIdx + running_nEntr) = rowView.value(alpha); + if (colInd == selected_row) diag = rowView.value(alpha); + } + ++running_nEntr; + } + } + // , numEntrU + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numEntrU)); + + U_entryIdx += numEntrU; } - size_type update_rows = updateIdx; - for (size_type entryIdx = At.graph.row_map(selected_row); - entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { - if ((At.graph.entries(entryIdx) != selected_row) && - (factored(A.graph.entries(entryIdx)) != 1)) { - bool already_updated = false; - for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { - if (At.graph.entries(entryIdx) == update_list(checkIdx)) { - already_updated = true; - break; + + // Only one thread found diagonal so just sum over all + team.team_reduce(Kokkos::Sum(diag)); + + 
// Insert the lower part of the selected column of A + // divided by its diagonal value to obtain a unit + // diagonal value in L. + { + size_type numEntrL = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, size_type& running_nEntr, bool is_final) { + const auto rowInd = colView.colidx(alpha); + if (permutation_inv(rowInd) > factorization_step) { + if (is_final) { + ++numEntrL; + entriesL(L_entryIdx + running_nEntr) = rowInd; + valuesL(L_entryIdx + running_nEntr) = + colView.value(alpha) / diag; + } + ++running_nEntr; + } } - } - if (already_updated == false) { - update_list(updateIdx) = At.graph.entries(entryIdx); - ++updateIdx; - } - } + // , numEntrL + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numEntrL)); + + L_entryIdx += numEntrL; } - update_list_length(0) = updateIdx; + { + ordinal_type numUpdateL = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, ordinal_type& running_update, + bool is_final) { + const auto rowInd = colView.colidx(alpha); + if ((rowInd != selected_row) && (factored(rowInd) != 1)) { + // updateIdx currently holds the rows that were updated.
don't add + // duplicates + const size_type& update_rows = updateIdx; + + const bool already_updated = + sorted_view_contains(update_list, update_rows, rowInd); + + if (!already_updated) { + // Cannot make use of vector ranges until + // https://github.com/kokkos/kokkos/issues/6259 is resolved + // Kokkos::single(Kokkos::PerThread(team),[&]{ + if (is_final) { + update_list(updateIdx + running_update) = rowInd; + ++numUpdateL; + } + ++running_update; + // }); + } + } + } + // , numUpdateL + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numUpdateL)); + + updateIdx += numUpdateL; + } + + Kokkos::single(Kokkos::PerTeam(team), [&] { + row_mapU(factorization_step + 1) = U_entryIdx; + row_mapL(factorization_step + 1) = L_entryIdx; + + update_list_len = updateIdx; + selected_row_len = rowView.length; + + factored(selected_row) = 1; + }); } }; diff --git a/sparse/impl/KokkosSparse_merge_matrix.hpp b/sparse/impl/KokkosSparse_merge_matrix.hpp new file mode 100644 index 0000000000..18c9467a9a --- /dev/null +++ b/sparse/impl/KokkosSparse_merge_matrix.hpp @@ -0,0 +1,301 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_MERGEMATRIX_HPP +#define KOKKOSSPARSE_MERGEMATRIX_HPP + +#include + +#include "KokkosKernels_Iota.hpp" +#include "KokkosKernels_LowerBound.hpp" +#include "KokkosKernels_Predicates.hpp" +#include "KokkosKernels_SafeCompare.hpp" + +/// \file KokkosSparse_merge_matrix.hpp + +namespace KokkosSparse::Impl { + +// a joint index into a and b +template +struct MergeMatrixPosition { + using a_index_type = AIndex; + using b_index_type = BIndex; + + AIndex ai; + BIndex bi; +}; + +/*! \class MergeMatrixDiagonal + \brief a view into the entries of the Merge Matrix along a diagonal + + @tparam AView Type of the input view a, must be rank 1 + @tparam BViewLike Type of the view-like object b, must be Kokkos::View or + KokkosKernels::Iota Example merge matrix M of two arrays A (vertical) and B + (horizontal), as seen in Odeh, Green, Mwassi, Shmueli, Birk Merge Path - + Parallel Merging Made Simple 2012 M[i,j] = 1 iff A[i] > B[j] operator(k) + returns A[i] > B[j] at the kth entry of the diagonal + + 3 5 12 22 45 64 69 82 + ------------------------ + | / / + 17 | 1 1 1 0 0 0 0 0 + |/ / + 29 | 1 1 1 1 0 0 0 0 + | / + 35 | 1 1 1 1 0 0 0 0 + | / + 73 | 1 1 1 1 1 1 1 0 + | / + 86 | 1 1 1 1 1 1 1 1 + |/ + 90 | 1 1 1 1 1 1 1 1 + | + 95 | 1 1 1 1 1 1 1 1 + | + 99 | 1 1 1 1 1 1 1 1 + Diagonals are counted from the top-left. + Index into a diagonal from the bottom-left. 
+ Shown on the figure above is the 1st and 5th diagonal + The 0th diagonal D_0 has length 0 + The 1st diagonal D_1 has length 1 + The 5th diagonal D_5 has length 5 + The 9th diagonal D_9 has length 7 + D_1(0) = 1 + D_5(0..3) = 1 + D_5(4) = 0 +*/ +template +class MergeMatrixDiagonal { + public: + static_assert(AView::rank == 1, "MergeMatrixDiagonal AView must be rank 1"); + static_assert(Kokkos::is_view_v || + KokkosKernels::Impl::is_iota_v, + "MergeMatrixDiagonal BViewLike must be Kokkos::View or " + "KokkosKernels::Iota"); + static_assert(BViewLike::rank == 1, + "MergeMatrixDiagonal BViewLike must be rank 1"); + + using execution_space = typename AView::execution_space; + + /** + * Define the types for index and value of each view + */ + using a_index_type = typename AView::size_type; + using b_index_type = typename BViewLike::size_type; + using a_value_type = typename AView::non_const_value_type; + using b_value_type = typename BViewLike::non_const_value_type; + + using position_type = MergeMatrixPosition; + + // implement bare minimum parts of the view interface + enum { rank = 1 }; + using non_const_value_type = bool; ///< Merge matrix entries are 0 or 1. + + using size_type = + typename std::conditional= + sizeof(typename BViewLike::size_type), + typename AView::size_type, + typename BViewLike::size_type>:: + type; ///< The larger of the two view types' size_types + + /** \brief Initializes the view a and view-like object b and the diagonal. 
+ */ + KOKKOS_INLINE_FUNCTION + MergeMatrixDiagonal(const AView &a, const BViewLike &b, + const size_type diagonal) + : a_(a), b_(b), d_(diagonal) {} + MergeMatrixDiagonal() = default; + + /** + * Computes the position along a and b for a given diagonal di + * + * @param di Current diagonal + * @return The MatrixPosition corresponding to the current diagonal + */ + KOKKOS_INLINE_FUNCTION + position_type position(const size_type &di) const noexcept { + position_type pos; + if (0 == d_) { + pos.ai = 0; + pos.bi = 0; + return pos; + } else { + pos = diag_to_a_b(di); + pos.ai += 1; + return pos; + } + } + + /** + * Compares a[i] > b[j] along the diagonal at entry di + * + * @param di Current diagonal + * @return True if a[i] > b[j], false otherwise + */ + KOKKOS_INLINE_FUNCTION + bool operator()(const size_type di) const { + position_type pos = diag_to_a_b(di); + + if (pos.ai >= typename position_type::a_index_type(a_.size())) { + return true; // on the +a side out of matrix bounds is 1 + } else if (pos.bi >= typename position_type::b_index_type(b_.size())) { + return false; // on the +b side out of matrix bounds is 0 + } else { + return KokkosKernels::Impl::safe_gt(a_(pos.ai), b_(pos.bi)); + } + } + + /** + * Returns the length of the diagonal + * + * @return Length of the diagonal + */ + KOKKOS_INLINE_FUNCTION + size_type size() const noexcept { + if (d_ <= size_type(a_.size()) && d_ <= size_type(b_.size())) { + return d_; + } else if (d_ > size_type(a_.size()) && d_ > size_type(b_.size())) { + // TODO: this returns nonsense if d_ happens to be outside the merge + // matrix + return a_.size() + b_.size() - d_; + } else { + return KOKKOSKERNELS_MACRO_MIN(a_.size(), b_.size()); + } + } + + private: + /** + * Translates an index along the diagonal to indices into a_ and b_ + * + * @param di Current diagonal + * @return The corresponding MatrixPosition with indices into a_ and b_ + */ + KOKKOS_INLINE_FUNCTION + position_type diag_to_a_b(const size_type &di) const 
noexcept { + position_type res; + res.ai = d_ < size_type(a_.size()) ? (d_ - 1) - di : a_.size() - 1 - di; + res.bi = d_ < size_type(a_.size()) ? di : d_ + di - a_.size(); + return res; + } + + AView a_; ///< The a view + BViewLike b_; ///< The b view + size_type d_; ///< diagonal +}; + +/*! \brief Return the first index on diagonal \c diag + in the merge matrix of \c a and \c b that is not 1 +This is effectively a lower-bound search on the merge matrix diagonal +where the predicate is "equals 1" +*/ +template +KOKKOS_INLINE_FUNCTION + typename MergeMatrixDiagonal::position_type + diagonal_search( + const AView &a, const BViewLike &b, + typename MergeMatrixDiagonal::size_type diag) { + // unmanaged view types for a and b + using um_a_view = + Kokkos::View; + using um_b_view = + Kokkos::View; + + um_a_view ua(a.data(), a.size()); + + // if BViewLike is an Iota, pass it on directly to MMD, + // otherwise, create an unmanaged view of B + using b_type = + typename std::conditional::value, + BViewLike, um_b_view>::type; + + using MMD = MergeMatrixDiagonal; + MMD mmd; + if constexpr (KokkosKernels::Impl::is_iota::value) { + mmd = MMD(ua, b, diag); + } else { + b_type ub(b.data(), b.size()); + mmd = MMD(ua, ub, diag); + } + + // returns index of the first element that does not satisfy pred(element, + // value) our input view is the merge matrix entry along the diagonal, and we + // want the first one that is not true.
so our predicate just tells us if the + // merge matrix diagonal entry is equal to true or not + const typename MMD::size_type idx = KokkosKernels::lower_bound_thread( + mmd, true, KokkosKernels::Equal()); + return mmd.position(idx); +} + +template +KOKKOS_INLINE_FUNCTION + typename MergeMatrixDiagonal::position_type + diagonal_search( + const TeamMember &handle, const AView &a, const BViewLike &b, + typename MergeMatrixDiagonal::size_type diag) { + // unmanaged view types for a and b + using um_a_view = + Kokkos::View; + using um_b_view = + Kokkos::View; + + um_a_view ua(a.data(), a.size()); + + // if BViewLike is an Iota, pass it on directly to MMD, + // otherwise, create an unmanaged view of B + using b_type = + typename std::conditional::value, + BViewLike, um_b_view>::type; + + using MMD = MergeMatrixDiagonal; + MMD mmd; + if constexpr (KokkosKernels::Impl::is_iota::value) { + mmd = MMD(ua, b, diag); + } else { + b_type ub(b.data(), b.size()); + mmd = MMD(ua, ub, diag); + } + + // returns index of the first element that does not satisfy pred(element, + // value) our input view is the merge matrix entry along the diagonal, and we + // want the first one that is not true. so our predicate just tells us if the + // merge matrix diagonal entry is equal to true or not + const typename MMD::size_type idx = KokkosKernels::lower_bound_team( + handle, mmd, true, KokkosKernels::Equal()); + return mmd.position(idx); +} + +/*! 
\brief + + \return A MergeMatrixDiagonal::position_type + */ +template +KOKKOS_INLINE_FUNCTION auto diagonal_search( + const View &a, typename View::non_const_value_type totalWork, + typename View::size_type diag) { + using value_type = typename View::non_const_value_type; + using size_type = typename View::size_type; + + KokkosKernels::Impl::Iota iota(totalWork); + return diagonal_search(a, iota, diag); +} + +} // namespace KokkosSparse::Impl + +#endif // KOKKOSSPARSE_MERGEMATRIX_HPP diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index c482aff429..0ac9c26166 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -60,8 +60,8 @@ struct IlutWrap { static size_type prefix_sum(RowMapType& row_map) { size_type result = 0; KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum< - RowMapType, typename IlutHandle::HandleExecSpace>(row_map.extent(0), - row_map, result); + typename IlutHandle::HandleExecSpace>(row_map.extent(0), row_map, + result); return result; } diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 62e074ff07..15132f9da3 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -523,8 +523,7 @@ void spadd_symbolic_impl( runSortedCountEntries( a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nrows + 1, c_rowmap); } else { // note: scoping individual parts of the process to free views sooner, @@ -542,8 +541,7 @@ void spadd_symbolic_impl( Kokkos::parallel_for( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", range_type(0, nrows), countEntries); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nrows + 1, 
c_rowmap_upperbound); Kokkos::deep_copy(c_nnz_upperbound, Kokkos::subview(c_rowmap_upperbound, nrows)); @@ -585,8 +583,7 @@ void spadd_symbolic_impl( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries", range_type(0, nrows), mergeEntries); // compute actual c_rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nrows + 1, c_rowmap); } addHandle->set_a_b_pos(a_pos, b_pos); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 1e61a66c84..5365970292 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -546,8 +546,7 @@ struct KokkosSPGEMM( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( m + 1, rowmapC, c_nnz_size); this->handle->get_spgemm_handle()->set_c_nnz(c_nnz_size); nnz_lno_t c_max_nnz = @@ -2188,8 +2187,7 @@ void KokkosSPGEMM< } #endif typename c_row_view_t::non_const_value_type c_nnz_size = 0; - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( m + 1, rowmapC, c_nnz_size); this->handle->get_spgemm_handle()->set_c_nnz(c_nnz_size); nnz_lno_t c_max_nnz = diff --git a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index dd1a7cd9b5..80d2fc1c04 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1818,8 +1818,7 @@ void KokkosSPGEMM( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( this->a_row_cnt + 1, rowmapC_); MyExecSpace().fence(); diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index abf44589f7..06fe6f094d 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -37,6 +37,40 @@ struct 
BsrMatrixSpMVTensorCoreFunctorParams { int leagueDim_y; }; +/*! \brief Can the tensor core impl be used in ExecutionSpace to operate on + AMatrix, XMatrix, and YMatrix? +*/ +template +class TensorCoresAvailable { +#if defined(KOKKOS_ENABLE_CUDA) + using AScalar = typename AMatrix::non_const_value_type; + using YScalar = typename YMatrix::non_const_value_type; + using XScalar = typename XMatrix::non_const_value_type; + + using a_mem_space = typename AMatrix::memory_space; + using x_mem_space = typename XMatrix::memory_space; + using y_mem_space = typename YMatrix::memory_space; + + template + constexpr static bool is_scalar() { + return std::is_scalar_v || + std::is_same_v, Kokkos::Experimental::half_t>; + } + + public: + constexpr static inline bool value = + Kokkos::SpaceAccessibility::accessible && + Kokkos::SpaceAccessibility::accessible && + Kokkos::SpaceAccessibility::accessible && + is_scalar() && is_scalar() && is_scalar() && + std::is_same_v; +#else + public: + constexpr static inline bool value = false; +#endif +}; + /// \brief Functor for the BsrMatrix SpMV multivector implementation utilizing /// tensor cores. /// @@ -48,7 +82,7 @@ struct BsrMatrixSpMVTensorCoreFunctorParams { /// TEAMS_PER_BLOCK_M and TEAMS_PER_BLOCK_N) if non-zero, statically-known /// launch parameters to reduce the cost of divmod operations on the GPU. If 0, /// provided runtime values will be used instead. 
-template ; typedef typename AMatrix::device_type Device; - typedef Kokkos::TeamPolicy team_policy; + typedef Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; typedef typename AMatrix::value_type AScalar; typedef typename YMatrix::value_type YScalar; @@ -181,12 +215,13 @@ struct BsrMatrixSpMVTensorCoreFunctor { } // execute the functor with provided launch parameters - void dispatch() { - typename BsrMatrixSpMVTensorCoreFunctor::team_policy policy(league_size(), - team_size()); + void dispatch(const execution_space &exec) { + typename BsrMatrixSpMVTensorCoreFunctor::team_policy policy( + exec, league_size(), team_size()); policy.set_scratch_size(0, Kokkos::PerTeam(team_scratch_size())); - Kokkos::parallel_for("KokkosSparse::BsrMatrixSpMVTensorCoreFunctor", policy, - *this); + Kokkos::parallel_for( + "KokkosSparse::Experimental::BsrMatrixSpMVTensorCoreFunctor", policy, + *this); } /* @@ -412,7 +447,7 @@ struct BsrMatrixSpMVTensorCoreFunctor { /// This is a struct instead of a function for template...using shorthand /// Discriminates between non-complex/on-GPU (supported) and otherwise /// (unsupported) scalar types, and throws a runtime error for unsupported types -template - using Dyn = BsrMatrixSpMVTensorCoreFunctor; + using Dyn = + BsrMatrixSpMVTensorCoreFunctor; // to be used when the various matrix types are supported - static void tag_dispatch(std::true_type, YScalar alpha, AMatrix a, XMatrix x, + static void tag_dispatch(std::true_type, const execution_space &exec, + const YScalar alpha, AMatrix a, XMatrix x, YScalar beta, YMatrix y) { BsrMatrixSpMVTensorCoreFunctorParams params = Dyn<0, 0, 0>::launch_parameters(alpha, a, x, beta, y); @@ -436,66 +473,48 @@ struct BsrMatrixSpMVTensorCoreDispatcher { if (false) { // consistency of formatting for next sections } else if (1 == params.leagueDim_x && 1 == params.teamsPerBlockM && 1 == params.teamsPerBlockN) { - Dyn<1, 1, 1>(alpha, a, x, beta, y, params).dispatch(); + Dyn<1, 1, 
1>(alpha, a, x, beta, y, params).dispatch(exec); } else if (1 == params.leagueDim_x && 2 == params.teamsPerBlockM && 2 == params.teamsPerBlockN) { - Dyn<1, 2, 2>(alpha, a, x, beta, y, params).dispatch(); + Dyn<1, 2, 2>(alpha, a, x, beta, y, params).dispatch(exec); } else if (1 == params.leagueDim_x && 4 == params.teamsPerBlockM && 4 == params.teamsPerBlockN) { - Dyn<1, 4, 4>(alpha, a, x, beta, y, params).dispatch(); + Dyn<1, 4, 4>(alpha, a, x, beta, y, params).dispatch(exec); } else if (1 == params.leagueDim_x && 8 == params.teamsPerBlockM && 8 == params.teamsPerBlockN) { - Dyn<1, 8, 8>(alpha, a, x, beta, y, params).dispatch(); + Dyn<1, 8, 8>(alpha, a, x, beta, y, params).dispatch(exec); } else if (2 == params.leagueDim_x && 1 == params.teamsPerBlockM && 1 == params.teamsPerBlockN) { - Dyn<2, 1, 1>(alpha, a, x, beta, y, params).dispatch(); + Dyn<2, 1, 1>(alpha, a, x, beta, y, params).dispatch(exec); } else if (2 == params.leagueDim_x && 2 == params.teamsPerBlockM && 2 == params.teamsPerBlockN) { - Dyn<2, 2, 2>(alpha, a, x, beta, y, params).dispatch(); + Dyn<2, 2, 2>(alpha, a, x, beta, y, params).dispatch(exec); } else if (2 == params.leagueDim_x && 4 == params.teamsPerBlockM && 4 == params.teamsPerBlockN) { - Dyn<2, 4, 4>(alpha, a, x, beta, y, params).dispatch(); + Dyn<2, 4, 4>(alpha, a, x, beta, y, params).dispatch(exec); } else if (2 == params.leagueDim_x && 8 == params.teamsPerBlockM && 8 == params.teamsPerBlockN) { - Dyn<2, 8, 8>(alpha, a, x, beta, y, params).dispatch(); + Dyn<2, 8, 8>(alpha, a, x, beta, y, params).dispatch(exec); } else { - Dyn<0, 0, 0>(alpha, a, x, beta, y, params).dispatch(); + Dyn<0, 0, 0>(alpha, a, x, beta, y, params).dispatch(exec); } } // to be used to avoid instantiating on unsupported types - static void tag_dispatch(std::false_type, YScalar, AMatrix, XMatrix, YScalar, - YMatrix) { + static void tag_dispatch(std::false_type, const execution_space &, YScalar, + AMatrix, XMatrix, YScalar, YMatrix) { 
KokkosKernels::Impl::throw_runtime_exception( "Tensor core SpMV is only supported for non-complex types in GPU " "execution spaces"); } - - /*true if none of T1, T2, or T3 are complex*/ - template - struct none_complex { - const static bool value = !Kokkos::ArithTraits::is_complex && - !Kokkos::ArithTraits::is_complex && - !Kokkos::ArithTraits::is_complex; - }; - - /*true if T1::execution_space, T2, or T3 are all GPU exec space*/ - template - struct all_gpu { - const static bool value = KokkosKernels::Impl::kk_is_gpu_exec_space() && - KokkosKernels::Impl::kk_is_gpu_exec_space() && - KokkosKernels::Impl::kk_is_gpu_exec_space(); - }; - - static void dispatch(YScalar alpha, AMatrix a, XMatrix x, YScalar beta, - YMatrix y) { + static void dispatch(const execution_space &exec, YScalar alpha, AMatrix a, + XMatrix x, YScalar beta, YMatrix y) { // tag will be false unless all conditions are met - using tag = std::integral_constant< - bool, none_complex::value && - all_gpu::value>; - tag_dispatch(tag{}, alpha, a, x, beta, y); + using tag = + std::integral_constant::value>; + tag_dispatch(tag{}, exec, alpha, a, x, beta, y); } }; @@ -663,6 +682,7 @@ template ()>::type * = nullptr> void spMatVec_no_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -672,9 +692,9 @@ void spMatVec_no_transpose( // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(y, beta, y); + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); // // Treat the case y <- alpha * A * x + beta * y @@ -701,14 +721,14 @@ void spMatVec_no_transpose( "KokkosSparse::bspmv", Kokkos::RangePolicy< typename AMatrix_Internal::device_type::execution_space, - Kokkos::Schedule>(0, A.numRows()), + Kokkos::Schedule>(exec, 0, A.numRows()), func); } else { Kokkos::parallel_for( "KokkosSparse::bspmv", Kokkos::RangePolicy< typename AMatrix_Internal::device_type::execution_space, - Kokkos::Schedule>(0, A.numRows()), + Kokkos::Schedule>(exec, 0, A.numRows()), func); } } @@ -723,6 +743,7 @@ template ()>::type * = nullptr> void spMatVec_no_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -785,11 +806,11 @@ void spMatVec_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::bspmv", policy, func); } else { @@ -798,11 +819,11 @@ void spMatVec_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::bspmv", policy, func); } @@ -974,6 +995,7 @@ template ()>::type * = nullptr> void spMatVec_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const 
KokkosSparse::Experimental::BsrMatrix< @@ -983,9 +1005,9 @@ void spMatVec_transpose( // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(y, beta, y); + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); if (alpha == Kokkos::ArithTraits::zero()) return; @@ -1033,7 +1055,8 @@ template ()>::type * = nullptr> -void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, +void spMatVec_transpose(const typename AMatrix::execution_space &exec, + const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { @@ -1045,7 +1068,10 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, const auto block_dim = A.blockDim(); - KokkosBlas::scal(y, beta, y); + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule @@ -1092,11 +1118,11 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * @@ -1104,7 +1130,7 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, else policy = 
Kokkos::TeamPolicy>( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * @@ -1113,11 +1139,11 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, func); } else { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * @@ -1125,7 +1151,7 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * @@ -1298,6 +1324,7 @@ template ()>::type * = nullptr> void spMatMultiVec_no_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -1307,9 +1334,9 @@ void spMatMultiVec_no_transpose( // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(y, beta, y); + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); // // Treat the case y <- alpha * A * x + beta * y // @@ -1357,6 +1384,7 @@ template ()>::type * = nullptr> void spMatMultiVec_no_transpose( + const typename AD::execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -1415,15 +1443,15 @@ void spMatMultiVec_no_transpose( if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::bsr_spm_mv", policy, func); } else { @@ -1432,20 +1460,19 @@ void spMatMultiVec_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::bsr_spm_mv", policy, func); } } /* ******************* */ -template +template struct BSR_GEMM_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; @@ -1622,11 +1649,12 @@ struct BSR_GEMM_Transpose_Functor { /// \brief spMatMultiVec_transpose: version for CPU execution spaces /// (RangePolicy or trivial serial impl used) -template ()>::type * = 
nullptr> void spMatMultiVec_transpose( + const execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< @@ -1636,16 +1664,15 @@ void spMatMultiVec_transpose( // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(y, beta, y); + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); // // Treat the case y <- alpha * A^T * x + beta * y // typedef KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - typedef typename AMatrix_Internal::execution_space execution_space; bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule @@ -1657,19 +1684,20 @@ void spMatMultiVec_transpose( } } - BSR_GEMM_Transpose_Functor func( - alpha, A, x, y, useConjugate); + BSR_GEMM_Transpose_Functor + func(alpha, A, x, y, useConjugate); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::parallel_for( "KokkosSparse::bsr_spm_mv", Kokkos::RangePolicy>( - 0, A.numRows()), + exec, 0, A.numRows()), func); } else { Kokkos::parallel_for( "KokkosSparse::bsr_spm_mv", Kokkos::RangePolicy>( - 0, A.numRows()), + exec, 0, A.numRows()), func); } } @@ -1677,11 +1705,12 @@ void spMatMultiVec_transpose( // // spMatMultiVec_transpose: version for GPU execution spaces (TeamPolicy used) // -template ()>::type * = nullptr> + execution_space>()>::type * = nullptr> void spMatMultiVec_transpose( + const execution_space &exec, const KokkosKernels::Experimental::Controls &controls, const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, YVector 
&y, bool useConjugate) { @@ -1689,9 +1718,10 @@ void spMatMultiVec_transpose( return; } - KokkosBlas::scal(y, beta, y); - - typedef typename AMatrix::execution_space execution_space; + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(exec, y, Kokkos::ArithTraits::zero()); + else if (beta != Kokkos::ArithTraits::one()) + KokkosBlas::scal(exec, y, beta, y); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule @@ -1732,16 +1762,16 @@ void spMatMultiVec_transpose( vector_length = std::stoi(controls.getParameter("vector length")); } - BSR_GEMM_Transpose_Functor func(alpha, A, x, y, - useConjugate); + BSR_GEMM_Transpose_Functor func( + alpha, A, x, y, useConjugate); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * x.extent(1) * @@ -1749,7 +1779,7 @@ void spMatMultiVec_transpose( else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * x.extent(1) * @@ -1758,11 +1788,11 @@ void spMatMultiVec_transpose( func); } else { Kokkos::TeamPolicy> - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * x.extent(1) * @@ -1770,7 +1800,7 @@ void spMatMultiVec_transpose( else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size( 0, Kokkos::PerTeam( block_dim * x.extent(1) * diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp 
b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp new file mode 100644 index 0000000000..1c0d2fc361 --- /dev/null +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp @@ -0,0 +1,160 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP +#define KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP + +#include + +#include + +namespace KokkosSparse { +namespace Impl { + +/* One thread for each entry in the product multivector + + Each thread accumulates the partial products for its entry, and writes it + out. 
+*/ +template +class BsrSpmvV42NonTrans { + Alpha alpha_; + AMatrix a_; + XVector x_; + Beta beta_; + YVector y_; + + public: + BsrSpmvV42NonTrans(const Alpha &alpha, const AMatrix &a, const XVector &x, + const Beta &beta, const YVector &y) + : alpha_(alpha), a_(a), x_(x), beta_(beta), y_(y) {} + + template + KOKKOS_INLINE_FUNCTION void impl(const size_t k) const { + using a_ordinal_type = typename AMatrix::non_const_ordinal_type; + using a_size_type = typename AMatrix::non_const_size_type; + using y_value_type = typename YVector::non_const_value_type; + using const_block_type = typename AMatrix::const_block_type; + + const a_ordinal_type irhs = k / y_.extent(0); + const a_ordinal_type row = k % y_.extent(0); + + // scale by beta + if (0 == beta_) { + y_(row, irhs) = 0; // convert NaN to 0 + } else if (1 != beta_) { + y_(row, irhs) *= beta_; + } + + // for non-zero template instantiations, + // constant propagation should optimize divmod + a_ordinal_type blocksz; + if constexpr (0 == BLOCK_SIZE) { + blocksz = a_.blockDim(); + } else { + blocksz = BLOCK_SIZE; + } + + if (0 != alpha_) { + const a_ordinal_type blockRow = row / blocksz; + const a_ordinal_type lclrow = row % blocksz; + y_value_type accum = 0; + const a_size_type j_begin = a_.graph.row_map(blockRow); + const a_size_type j_end = a_.graph.row_map(blockRow + 1); + for (a_size_type j = j_begin; j < j_end; ++j) { + const_block_type b = a_.unmanaged_block_const(j); + const a_ordinal_type blockcol = a_.graph.entries(j); + const a_ordinal_type x_start = blockcol * blocksz; + + const auto x_lcl = Kokkos::subview( + x_, Kokkos::make_pair(x_start, x_start + blocksz), irhs); + for (a_ordinal_type i = 0; i < blocksz; ++i) { + accum += b(lclrow, i) * x_lcl(i); + } + } + y_(row, irhs) += alpha_ * accum; + } + } + + KOKKOS_INLINE_FUNCTION void operator()(const size_t k) const { + if (false) { + } + // clang-format off + else if ( 1 == a_.blockDim()) { impl< 1>(k); } + else if ( 2 == a_.blockDim()) { impl< 2>(k); } + 
else if ( 3 == a_.blockDim()) { impl< 3>(k); } + else if ( 4 == a_.blockDim()) { impl< 4>(k); } + else if ( 5 == a_.blockDim()) { impl< 5>(k); } + else if ( 6 == a_.blockDim()) { impl< 6>(k); } + else if ( 7 == a_.blockDim()) { impl< 7>(k); } + else if ( 8 == a_.blockDim()) { impl< 8>(k); } + else if ( 9 == a_.blockDim()) { impl< 9>(k); } + else if (10 == a_.blockDim()) { impl<10>(k); } + else if (11 == a_.blockDim()) { impl<11>(k); } + // clang-format on + else { + impl<0>(k); + } + } +}; + +template +void apply_v42(const typename AMatrix::execution_space &exec, + const Alpha &alpha, const AMatrix &a, const XVector &x, + const Beta &beta, const YVector &y) { + using execution_space = typename AMatrix::execution_space; + + Kokkos::RangePolicy policy(exec, 0, y.size()); + if constexpr (YVector::rank == 1) { +// lbv - 07/26/2023: +// with_unmanaged_t<...> required Kokkos 4.1.0, +// the content of this header will be guarded +// until v4.3.0 +#if KOKKOS_VERSION >= 40100 || defined(DOXY) + // Implementation expects a 2D view, so create an unmanaged 2D view + // with extent 1 in the second dimension + using Y2D = KokkosKernels::Impl::with_unmanaged_t>; + using X2D = KokkosKernels::Impl::with_unmanaged_t>; +#else + // Implementation expects a 2D view, so create an unmanaged 2D view + // with extent 1 in the second dimension + using Y2D = Kokkos::View< + typename YVector::value_type * [1], typename YVector::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits>; + using X2D = Kokkos::View< + typename XVector::value_type * [1], typename XVector::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits>; +#endif // KOKKOS_VERSION >= 40100 || defined(DOXY) + const Y2D yu(y.data(), y.extent(0), 1); + const X2D xu(x.data(), x.extent(0), 1); + BsrSpmvV42NonTrans op(alpha, a, xu, beta, yu); + Kokkos::parallel_for(policy, op); + } else { + BsrSpmvV42NonTrans op(alpha, a, x, beta, y); + Kokkos::parallel_for(policy, op); + } +} + +} // namespace Impl +} // 
namespace KokkosSparse + +#endif // KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 678aaaa0c5..564100879e 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -25,6 +25,7 @@ #include "KokkosKernels_Error.hpp" #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include +#include "KokkosSparse_spmv_bsrmatrix_impl_v42.hpp" #endif namespace KokkosSparse { @@ -32,16 +33,14 @@ namespace Experimental { namespace Impl { // default is no eti available -template +template struct spmv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; -template ::type>::value> + std::is_integral_v> struct spmv_mv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; @@ -50,38 +49,44 @@ struct spmv_mv_bsrmatrix_eti_spec_avail { } // namespace Experimental } // namespace KokkosSparse -#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template <> \ - struct spmv_bsrmatrix_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_bsrmatrix_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template <> \ - struct spmv_mv_bsrmatrix_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_mv_bsrmatrix_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include which ETIs are available @@ -94,40 +99,34 @@ namespace Experimental { namespace Impl { // declaration -template ::value, + ExecutionSpace, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_bsrmatrix_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + ExecutionSpace, AMatrix, XVector, YVector>::value> struct SPMV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type YScalar; static void spmv_bsrmatrix( + const ExecutionSpace &space, const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &x, const YScalar &beta, const YVector &y); }; // declaration -template ::type>::value, + 
std::is_integral_v, bool tpl_spec_avail = spmv_mv_bsrmatrix_tpl_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value, + ExecutionSpace, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_mv_bsrmatrix_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + ExecutionSpace, AMatrix, XVector, YVector>::value> struct SPMV_MV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type YScalar; static void spmv_mv_bsrmatrix( + const ExecutionSpace &space, const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &x, const YScalar &beta, const YVector &y); @@ -136,40 +135,68 @@ struct SPMV_MV_BSRMATRIX { // actual implementations to be compiled #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct SPMV_BSRMATRIX +struct SPMV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type YScalar; static void spmv_bsrmatrix( + const ExecutionSpace &space, const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &X, const YScalar &beta, const YVector &Y) { - // - if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - bool useConjugate = (mode[0] == Conjugate[0]); - return Bsr::spMatVec_no_transpose(controls, alpha, A, X, beta, Y, - useConjugate); - } else if ((mode[0] == Transpose[0]) || - (mode[0] == ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == ConjugateTranspose[0]); - return Bsr::spMatVec_transpose(controls, alpha, A, X, beta, Y, - useConjugate); + const bool modeIsNoTrans = (mode[0] == NoTranspose[0]); + const bool modeIsConjugate = (mode[0] == Conjugate[0]); + const bool modeIsConjugateTrans = (mode[0] == ConjugateTranspose[0]); + const 
bool modeIsTrans = (mode[0] == Transpose[0]); + + // use V41 if requested + if (controls.getParameter("algorithm") == ALG_V41) { + if (modeIsNoTrans || modeIsConjugate) { + return Bsr::spMatVec_no_transpose(space, controls, alpha, A, X, beta, Y, + modeIsConjugate); + } else if (modeIsTrans || modeIsConjugateTrans) { + return Bsr::spMatVec_transpose(space, controls, alpha, A, X, beta, Y, + modeIsConjugateTrans); + } + } + + // use V42 if possible + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || + controls.getParameter("algorithm") == ALG_V42) { + if (modeIsNoTrans) { + ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); + return; + } + } + + // fall back to V41 all else fails + if (modeIsNoTrans || modeIsConjugate) { + return Bsr::spMatVec_no_transpose(space, controls, alpha, A, X, beta, Y, + modeIsConjugate); + } else if (modeIsTrans || modeIsConjugateTrans) { + return Bsr::spMatVec_transpose(space, controls, alpha, A, X, beta, Y, + modeIsConjugateTrans); + } + + { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " "; + ss << "Internal logic error: no applicable BsrMatrix SpMV implementation " + ". 
Please report this"; + throw std::runtime_error(ss.str()); } } }; -template -struct SPMV_MV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; +template +struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; enum class Method { @@ -185,31 +212,21 @@ struct SPMV_MV_BSRMATRIX::is_complex) method = Method::Fallback; - if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; - if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; - // can't use tensor cores outside GPU - if (!KokkosKernels::Impl::kk_is_gpu_exec_space< - typename AMatrix::execution_space>()) - method = Method::Fallback; - if (!KokkosKernels::Impl::kk_is_gpu_exec_space< - typename XVector::execution_space>()) - method = Method::Fallback; - if (!KokkosKernels::Impl::kk_is_gpu_exec_space< - typename YVector::execution_space>()) + if (!KokkosSparse::Experimental::Impl::TensorCoresAvailable< + ExecutionSpace, AMatrix, XVector, YVector>::value) { method = Method::Fallback; + } // can't use tensor cores unless mode is no-transpose if (mode[0] != KokkosSparse::NoTranspose[0]) method = Method::Fallback; #if KOKKOS_HALF_T_IS_FLOAT @@ -240,17 +257,17 @@ struct SPMV_MV_BSRMATRIX::dispatch(alpha, A, X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, + X, beta, Y); return; } case Precision::Double: { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, + A, X, beta, Y); return; } case Precision::Automatic: // fallthrough @@ -260,16 +277,14 @@ struct SPMV_MV_BSRMATRIX::value && std::is_same::value; if (operandsHalfHalfFloat) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher< + ExecutionSpace, AMatrix, half, XVector, half, YVector, float, + 16, 16, 16>::dispatch(space, alpha, A, X, beta, Y); return; } else { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, 
A, X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher< + ExecutionSpace, AMatrix, double, XVector, double, YVector, + double, 8, 8, 4>::dispatch(space, alpha, A, X, beta, Y); return; } } @@ -282,51 +297,79 @@ struct SPMV_MV_BSRMATRIX::dispatch(alpha, A, - X, beta, - Y); + BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, X, + beta, Y); return; } } -#endif // KOKKOS_ARCH - - if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - bool useConjugate = (mode[0] == Conjugate[0]); - return Bsr::spMatMultiVec_no_transpose(controls, alpha, A, X, beta, Y, - useConjugate); - } else if ((mode[0] == Transpose[0]) || - (mode[0] == ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == ConjugateTranspose[0]); - return Bsr::spMatMultiVec_transpose(controls, alpha, A, X, beta, Y, - useConjugate); +#endif // defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_AMPERE) + + const bool modeIsNoTrans = (mode[0] == NoTranspose[0]); + const bool modeIsConjugate = (mode[0] == Conjugate[0]); + const bool modeIsConjugateTrans = (mode[0] == ConjugateTranspose[0]); + const bool modeIsTrans = (mode[0] == Transpose[0]); + + // use V41 if requested + if (controls.getParameter("algorithm") == ALG_V41) { + if (modeIsNoTrans || modeIsConjugate) { + return Bsr::spMatMultiVec_no_transpose(space, controls, alpha, A, X, + beta, Y, modeIsConjugate); + } else if (modeIsTrans || modeIsConjugateTrans) { + return Bsr::spMatMultiVec_transpose(space, controls, alpha, A, X, beta, + Y, modeIsConjugateTrans); + } + } + + // use V42 if possible + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || + controls.getParameter("algorithm") == ALG_V42) { + if (modeIsNoTrans) { + ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); + return; + } + } + + // use V41 as the ultimate fallback + if (modeIsNoTrans || modeIsConjugate) { + return Bsr::spMatMultiVec_no_transpose(space, controls, alpha, A, X, beta, + Y, modeIsConjugate); + } else if (modeIsTrans || modeIsConjugateTrans) { + return 
Bsr::spMatMultiVec_transpose(space, controls, alpha, A, X, beta, Y, + modeIsConjugateTrans); + } + + { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " "; + ss << "Internal logic error: no applicable BsrMatrix SpMV implementation " + ". Please report this"; + throw std::runtime_error(ss.str()); } } }; -template -struct SPMV_MV_BSRMATRIX { - typedef BsrMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; +template +struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; static void spmv_mv_bsrmatrix( + const ExecutionSpace &space, const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &X, const YScalar &beta, const YVector &Y) { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); - typedef SPMV_BSRMATRIX - impl_type; for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { const auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); - impl_type::spmv_bsrmatrix(controls, mode, alpha, A, x_j, beta, y_j); + typedef SPMV_BSRMATRIX + impl_type; + impl_type::spmv_bsrmatrix(space, controls, mode, alpha, A, x_j, beta, + y_j); } } }; @@ -339,68 +382,80 @@ struct SPMV_MV_BSRMATRIX, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; - -#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_INST( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template struct SPMV_BSRMATRIX< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const *, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE *, 
LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_BSRMATRIX< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_BSRMATRIX< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; // declare / instantiate the 2D MV version // Instantiate with A,x,y are all the requested Scalar type (no instantiation of // mixed-precision operands) -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - extern template struct SPMV_MV_BSRMATRIX< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; - -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_INST( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template struct SPMV_MV_BSRMATRIX< \ - 
const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE **, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_MV_BSRMATRIX< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; + +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_MV_BSRMATRIX< \ + EXEC_SPACE_TYPE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; #include diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 6a82977e02..4f90002a61 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -17,43 +17,26 @@ #ifndef KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ #define KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ +#include + #include "KokkosKernels_Controls.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include 
"KokkosSparse_spmv_impl_omp.hpp" +#include "KokkosSparse_spmv_impl_merge.hpp" #include "KokkosKernels_Error.hpp" namespace KokkosSparse { namespace Impl { -template -struct GetCoeffView { - typedef Kokkos::View view_type; - typedef Kokkos::View - non_const_view_type; - static non_const_view_type get_view(const InputType in, const int size) { - non_const_view_type aview("CoeffView", size); - if (size > 0) Kokkos::deep_copy(aview, in); - return aview; - } -}; - -template -struct GetCoeffView, DeviceType> { - typedef Kokkos::View view_type; - static Kokkos::View get_view( - const Kokkos::View& in, int /*size*/) { - return in; - } -}; +constexpr const char* KOKKOSSPARSE_ALG_NATIVE_MERGE = "native-merge"; // This TransposeFunctor is functional, but not necessarily performant. -template +template struct SPMV_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -110,10 +93,9 @@ struct SPMV_Transpose_Functor { } }; -template +template struct SPMV_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -223,8 +205,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if (std::is_same::value) - max_vector_length = 64; + if (std::is_same::value) max_vector_length = 64; #endif if (vector_length < 1) { @@ -268,17 +249,17 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, // spmv_beta_no_transpose: version for CPU execution spaces (RangePolicy or // trivial serial impl used) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_beta_no_transpose( + const execution_space& exec, 
const KokkosKernels::Experimental::Controls& controls, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::execution_space execution_space; if (A.numRows() <= static_cast(0)) { return; @@ -375,6 +356,7 @@ static void spmv_beta_no_transpose( (((uintptr_t)(const void*)(y.data()) % 64) == 0) && !conjugate) { // Note BMK: this case is typically not called in practice even for OpenMP, // since it requires row_block_offsets to have been computed in the graph. + // Also, as this is raw OpenMP the execution space instance is not used spmv_raw_openmp_no_transpose(alpha, A, x, beta, y); return; @@ -390,34 +372,34 @@ static void spmv_beta_no_transpose( use_static_schedule = true; } } - SPMV_Functor func(alpha, A, x, - beta, y, 1); + SPMV_Functor + func(alpha, A, x, beta, y, 1); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) Kokkos::parallel_for( "KokkosSparse::spmv", Kokkos::RangePolicy>( - 0, A.numRows()), + exec, 0, A.numRows()), func); else Kokkos::parallel_for( "KokkosSparse::spmv", Kokkos::RangePolicy>( - 0, A.numRows()), + exec, 0, A.numRows()), func); } // spmv_beta_no_transpose: version for GPU execution spaces (TeamPolicy used) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_beta_no_transpose( + const execution_space& exec, const KokkosKernels::Experimental::Controls& controls, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::execution_space execution_space; if (A.numRows() <= static_cast(0)) { return; @@ -453,8 +435,8 @@ static void spmv_beta_no_transpose( A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length); int64_t worksets = 
(y.extent(0) + rows_per_team - 1) / rows_per_team; - SPMV_Functor func( - alpha, A, x, beta, y, rows_per_team); + SPMV_Functor + func(alpha, A, x, beta, y, rows_per_team); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { Kokkos::TeamPolicy> @@ -462,11 +444,11 @@ static void spmv_beta_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::spmv", policy, func); } else { @@ -475,11 +457,11 @@ static void spmv_beta_no_transpose( if (team_size < 0) policy = Kokkos::TeamPolicy>( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); else policy = Kokkos::TeamPolicy>( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); Kokkos::parallel_for("KokkosSparse::spmv", policy, func); } @@ -487,17 +469,17 @@ static void spmv_beta_no_transpose( // spmv_beta_transpose: version for CPU execution spaces (RangePolicy or trivial // serial impl used) -template ()>::type* = nullptr> -static void spmv_beta_transpose(typename YVector::const_value_type& alpha, + execution_space>()>::type* = nullptr> +static void spmv_beta_transpose(const execution_space& exec, + typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { - using ordinal_type = typename AMatrix::non_const_ordinal_type; - using size_type = typename AMatrix::non_const_size_type; - using execution_space = typename AMatrix::execution_space; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows() <= static_cast(0)) { return; @@ -506,13 +488,13 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, // We 
need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } #if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) || \ defined(KOKKOS_ENABLE_THREADS) { - if (execution_space().concurrency() == 1) { + if (exec.concurrency() == 1) { /// serial impl typedef typename AMatrix::non_const_value_type value_type; typedef Kokkos::ArithTraits ATV; @@ -567,25 +549,27 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, } #endif - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor + OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), + Kokkos::RangePolicy(exec, 0, nrow), OpType(alpha, A, x, y)); } // spmv_beta_transpose: version for GPU execution spaces (TeamPolicy used) -template ()>::type* = nullptr> -static void spmv_beta_transpose(typename YVector::const_value_type& alpha, + execution_space>()>::type* = nullptr> +static void spmv_beta_transpose(const execution_space& exec, + typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { - using ordinal_type = typename AMatrix::non_const_ordinal_type; - using size_type = typename AMatrix::non_const_size_type; - using execution_space = typename AMatrix::execution_space; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows() <= static_cast(0)) { return; @@ -594,7 +578,7 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } // Assuming that no row contains duplicate entries, NNZPerRow @@ -609,14 +593,15 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if (std::is_same::value) - max_vector_length = 64; + if (std::is_same::value) max_vector_length = 64; #endif while ((vector_length * 2 * 3 <= NNZPerRow) && (vector_length < max_vector_length)) vector_length *= 2; - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor + OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -625,50 +610,63 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::TeamPolicy(nteams, team_size, vector_length), - op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), + op); } -template -static void spmv_beta(const KokkosKernels::Experimental::Controls& controls, +template +static void spmv_beta(const execution_space& exec, + const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_beta_no_transpose( - controls, alpha, A, x, beta, y); + if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_NATIVE_MERGE) { + SpmvMergeHierarchical::spmv( + 
exec, mode, alpha, A, x, beta, y); + } else { + spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + } } else if (mode[0] == Conjugate[0]) { - spmv_beta_no_transpose( - controls, alpha, A, x, beta, y); + if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_NATIVE_MERGE) { + SpmvMergeHierarchical::spmv( + exec, mode, alpha, A, x, beta, y); + } else { + spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + } } else if (mode[0] == Transpose[0]) { - spmv_beta_transpose(alpha, A, x, - beta, y); + spmv_beta_transpose(exec, alpha, A, x, beta, y); } else if (mode[0] == ConjugateTranspose[0]) { - spmv_beta_transpose(alpha, A, x, - beta, y); + spmv_beta_transpose(exec, alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } // Functor for implementing transpose and conjugate transpose sparse // matrix-vector multiply with multivector (2-D View) input and // output. This functor works, but is not necessarily performant. 
-template +template struct SPMV_MV_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; @@ -771,10 +769,9 @@ struct SPMV_MV_Transpose_Functor { } }; -template +template struct SPMV_MV_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; @@ -1172,11 +1169,12 @@ struct SPMV_MV_LayoutLeft_Functor { // spmv_alpha_beta_mv_no_transpose: version for CPU execution spaces // (RangePolicy) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_alpha_beta_mv_no_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1187,7 +1185,7 @@ static void spmv_alpha_beta_mv_no_transpose( } if (doalpha == 0) { if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } return; } else { @@ -1200,46 +1198,45 @@ static void spmv_alpha_beta_mv_no_transpose( #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_LayoutLeft_Functor + typedef SPMV_MV_LayoutLeft_Functor OpType; - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::RangePolicy(exec, 0, nrow), + op); #else // KOKKOS_FAST_COMPILE this will only 
instantiate one Kernel for // alpha/beta - typedef SPMV_MV_LayoutLeft_Functor + typedef SPMV_MV_LayoutLeft_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::RangePolicy(exec, 0, nrow), + op); #endif // KOKKOS_FAST_COMPILE } } // spmv_alpha_beta_mv_no_transpose: version for GPU execution spaces // (TeamPolicy) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_alpha_beta_mv_no_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1251,7 +1248,7 @@ static void spmv_alpha_beta_mv_no_transpose( } if (doalpha == 0) { if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } return; } else { @@ -1267,63 +1264,62 @@ static void spmv_alpha_beta_mv_no_transpose( #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_LayoutLeft_Functor + typedef SPMV_MV_LayoutLeft_Functor OpType; - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); const ordinal_type rows_per_thread = - RowsPerThread(NNZPerRow); + RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + 
rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_LayoutLeft_Functor + typedef SPMV_MV_LayoutLeft_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); const ordinal_type rows_per_thread = - RowsPerThread(NNZPerRow); + RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #endif // KOKKOS_FAST_COMPILE } } // spmv_alpha_beta_mv_transpose: version for CPU execution spaces (RangePolicy) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_alpha_beta_mv_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1336,45 +1332,46 @@ static void spmv_alpha_beta_mv_transpose( // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } if (doalpha != 0) { #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_Transpose_Functor + typedef SPMV_MV_Transpose_Functor OpType; OpType op(alpha, A, x, beta, y); const ordinal_type nrow = A.numRows(); - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::RangePolicy(exec, 0, nrow), + op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_Transpose_Functor + typedef SPMV_MV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - Kokkos::parallel_for( - "KokkosSparse::spmv", - Kokkos::RangePolicy(0, nrow), op); + Kokkos::parallel_for("KokkosSparse::spmv", + Kokkos::RangePolicy(exec, 0, nrow), + op); #endif // KOKKOS_FAST_COMPILE } } // spmv_alpha_beta_mv_transpose: version for GPU execution spaces (TeamPolicy) -template ()>::type* = nullptr> + execution_space>()>::type* = nullptr> static void spmv_alpha_beta_mv_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1388,7 +1385,7 @@ static void spmv_alpha_beta_mv_transpose( // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } if (doalpha != 0) { @@ -1406,79 +1403,88 @@ static void spmv_alpha_beta_mv_transpose( #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_Transpose_Functor + typedef SPMV_MV_Transpose_Functor OpType; OpType op(alpha, A, x, beta, y); const ordinal_type nrow = A.numRows(); const ordinal_type rows_per_thread = - RowsPerThread(NNZPerRow); + RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_Transpose_Functor + typedef SPMV_MV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); OpType op(alpha, A, x, beta, y); const ordinal_type rows_per_thread = - RowsPerThread(NNZPerRow); + RowsPerThread(NNZPerRow); const ordinal_type team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, 
team_size, vector_length), op); #endif // KOKKOS_FAST_COMPILE } } -template +template static void spmv_alpha_beta_mv( - const char mode[], const typename YVector::non_const_value_type& alpha, - const AMatrix& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { + const execution_space& exec, const char mode[], + const typename YVector::non_const_value_type& alpha, const AMatrix& A, + const XVector& x, const typename YVector::non_const_value_type& beta, + const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_no_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_no_transpose(exec, alpha, A, x, + beta, y); } else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_no_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_no_transpose(exec, alpha, A, x, + beta, y); } else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_transpose(exec, alpha, A, x, + beta, y); } else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_transpose(exec, alpha, A, x, beta, + y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } -template -void spmv_alpha_mv(const char mode[], +template +void spmv_alpha_mv(const execution_space& exec, const char mode[], const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1487,17 +1493,17 @@ void spmv_alpha_mv(const char mode[], typedef Kokkos::ArithTraits KAT; if (beta == KAT::zero()) { - spmv_alpha_beta_mv(mode, alpha, A, x, - beta, y); + spmv_alpha_beta_mv( + exec, mode, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - 
spmv_alpha_beta_mv(mode, alpha, A, x, - beta, y); + spmv_alpha_beta_mv( + exec, mode, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_alpha_beta_mv(mode, alpha, A, - x, beta, y); + spmv_alpha_beta_mv( + exec, mode, alpha, A, x, beta, y); } else { - spmv_alpha_beta_mv(mode, alpha, A, x, - beta, y); + spmv_alpha_beta_mv( + exec, mode, alpha, A, x, beta, y); } } diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp new file mode 100644 index 0000000000..9329b8a097 --- /dev/null +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -0,0 +1,375 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP +#define KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP + +#include + +#include "KokkosKernels_Iota.hpp" +#include "KokkosKernels_AlwaysFalse.hpp" + +#include "KokkosSparse_merge_matrix.hpp" + +namespace KokkosSparse::Impl { + +/*! 
\brief Merge-based SpMV + Hierarchical GPU implementation + Each team uses MergePath search to find the non-zeros and rows it is + responsible for Each thread in the team similarly uses diagonal search within + the team to determine which entries it will be responsible for + The threads then atomically accumulate partial produces +*/ +template +struct SpmvMergeHierarchical { + using device_type = typename YVector::device_type; + using exec_space = ExecutionSpace; + using y_value_type = typename YVector::non_const_value_type; + using x_value_type = typename XVector::non_const_value_type; + using A_value_type = typename AMatrix::non_const_value_type; + using A_ordinal_type = typename AMatrix::non_const_ordinal_type; + using A_size_type = typename AMatrix::non_const_size_type; + using row_map_non_const_value_type = + typename AMatrix::row_map_type::non_const_value_type; + + using policy_type = Kokkos::TeamPolicy; + using team_member = typename policy_type::member_type; + + using um_row_map_type = + Kokkos::View>; + + using row_map_scratch_type = + Kokkos::View>; + + using iota_type = KokkosKernels::Impl::Iota; + + using DSR = typename KokkosSparse::Impl::MergeMatrixDiagonal< + um_row_map_type, iota_type>::position_type; + + using KAT = Kokkos::ArithTraits; + + // results of a lower-bound and upper-bound diagonal search + struct Chunk { + DSR lb; // lower bound + DSR ub; // upper bound + }; + + template + struct SpmvMergeImplFunctor { + SpmvMergeImplFunctor(const y_value_type& _alpha, const AMatrix& _A, + const XVector& _x, const YVector& _y, + const A_size_type pathLengthThreadChunk) + : alpha(_alpha), + A(_A), + x(_x), + y(_y), + pathLengthThreadChunk_(pathLengthThreadChunk) {} + + y_value_type alpha; + AMatrix A; + XVector x; + YVector y; + A_size_type pathLengthThreadChunk_; + + KOKKOS_INLINE_FUNCTION void operator()(const team_member& thread) const { + const A_size_type pathLengthTeamChunk = + thread.team_size() * pathLengthThreadChunk_; + + const A_size_type 
pathLength = A.numRows() + A.nnz(); + const A_size_type teamD = + thread.league_rank() * pathLengthTeamChunk; // diagonal + const A_size_type teamDEnd = + KOKKOSKERNELS_MACRO_MIN(teamD + pathLengthTeamChunk, pathLength); + + // iota(i) -> i + iota_type iota(A.nnz()); + + // remove leading 0 from row_map + um_row_map_type rowEnds(&A.graph.row_map(1), A.graph.row_map.size() - 1); + + // compiler thinks these are "used" in team_broadcast below, so initialize + // them with something to silence the warning + DSR lb{}; + DSR ub{}; + + // thread 0 does the lower bound, thread 1 does the upper bound + if (0 == thread.team_rank() || 1 == thread.team_rank()) { + const A_size_type d = thread.team_rank() ? teamDEnd : teamD; + DSR dsr = diagonal_search(rowEnds, iota, d); + if (0 == thread.team_rank()) { + lb = dsr; + } + if (1 == thread.team_rank()) { + ub = dsr; + } + } + thread.team_broadcast(lb, 0); + thread.team_broadcast(ub, 1); + const A_size_type teamNnzBegin = + lb.bi; // the first nnz this team will handle + const A_size_type teamNnzEnd = + ub.bi; // one-past the last nnz this team will handle + const A_ordinal_type teamRowBegin = + lb.ai; // <= the row than the first nnz is in + const A_ordinal_type teamRowEnd = + ub.ai; // >= the row than the last nnz is in + + // team-collaborative copy of matrix data into scratch + A_size_type* rowEndsS{nullptr}; + A_ordinal_type* entriesS{nullptr}; + A_value_type* valuesS{nullptr}; + y_value_type* yS{nullptr}; + + if constexpr (ROWENDS_USE_SCRATCH) { + rowEndsS = (A_size_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_size_type)); + + // teamRowEnd may be equal to the row the team's last nnz is in + // so in most cases we want to read it (teamRowEnd+1). 
However, + // however, guard against reading off the end of the view + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1), + [&](const A_ordinal_type& i) { + if (i < A.numRows()) { + rowEndsS[i - teamRowBegin] = rowEnds(i); + } else { + rowEndsS[i - teamRowBegin] = A.nnz(); + } + }); + } else { + (void)(rowEndsS == rowEndsS); // set but unused, expr has no effect + } + + if constexpr (NONZEROS_USE_SCRATCH) { + valuesS = (A_value_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_value_type)); + entriesS = (A_ordinal_type*)thread.team_shmem().get_shmem( + pathLengthTeamChunk * sizeof(A_ordinal_type)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamNnzBegin, teamNnzEnd), + [=](const A_ordinal_type& i) { + valuesS[i - teamNnzBegin] = A.values(i); + entriesS[i - teamNnzBegin] = A.graph.entries(i); + }); + } else { + (void)(entriesS == entriesS); // set but unused, expr has no effect + (void)(valuesS == valuesS); // set but unused, expr has no effect + } + + if constexpr (Y_USE_SCRATCH) { + yS = (y_value_type*)thread.team_shmem().get_shmem(pathLengthTeamChunk * + sizeof(y_value_type)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, teamRowBegin, teamRowEnd + 1), + [&](const A_ordinal_type& i) { + if (i < A.numRows()) { + yS[i - teamRowBegin] = 0; + } + }); + } else { + (void)(yS == yS); // set but unused, expr has no effect + } + + if constexpr (ROWENDS_USE_SCRATCH || NONZEROS_USE_SCRATCH || + Y_USE_SCRATCH) { + thread.team_barrier(); + } + + // each thread determines its location within the team chunk + + // team's view of row map is either in scratch or global + typename std::conditional::type teamRowEnds; + if constexpr (ROWENDS_USE_SCRATCH) { + teamRowEnds = row_map_scratch_type(rowEndsS, teamRowEnd - teamRowBegin); + } else { + teamRowEnds = + um_row_map_type(&rowEnds(teamRowBegin), teamRowEnd - teamRowBegin); + } + + iota_type teamIota(teamNnzEnd - teamNnzBegin, + teamNnzBegin); // 
teamNnzBegin.. teamRowBegin && i < teamRowEnd) { + y(i) += yS[i - teamRowBegin]; + } else { + Kokkos::atomic_add(&y(i), yS[i - teamRowBegin]); + } + } + }); + } + } + + size_t team_shmem_size(int teamSize) const { + const A_size_type pathLengthTeamChunk = pathLengthThreadChunk_ * teamSize; + (void)pathLengthTeamChunk; // silence declared but not referenced + size_t val = 0; + if constexpr (Y_USE_SCRATCH) { + val += sizeof(y_value_type) * pathLengthTeamChunk; + } + if constexpr (ROWENDS_USE_SCRATCH) { + val += sizeof(row_map_non_const_value_type) * pathLengthTeamChunk; + } + if constexpr (NONZEROS_USE_SCRATCH) { + val += sizeof(A_ordinal_type) * pathLengthTeamChunk; + val += sizeof(A_value_type) * pathLengthTeamChunk; + } + return val; + } + }; // struct SpmvMergeImplFunctor + + static void spmv(const ExecutionSpace& space, const char mode[], + const y_value_type& alpha, const AMatrix& A, + const XVector& x, const y_value_type& beta, + const YVector& y) { + static_assert(XVector::rank == 1, ""); + static_assert(YVector::rank == 1, ""); + + KokkosBlas::scal(y, beta, y); + + /* determine launch parameters for different architectures + On architectures where there is a natural execution hierarchy with true + team scratch, we'll assign each team to use an appropriate amount of the + scratch. 
+ On other architectures, just have each team do the maximal amount of work + to amortize the cost of the diagonal search + */ + const A_size_type pathLength = A.numRows() + A.nnz(); + A_size_type pathLengthThreadChunk; + int teamSize; + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + pathLengthThreadChunk = 4; + teamSize = 128; + } else { + teamSize = 1; + pathLengthThreadChunk = (pathLength + exec_space().concurrency() - 1) / + exec_space().concurrency(); + } + + const size_t pathLengthTeamChunk = pathLengthThreadChunk * teamSize; + const int leagueSize = + (pathLength + pathLengthTeamChunk - 1) / pathLengthTeamChunk; + + policy_type policy(space, leagueSize, teamSize); + + /* Currently: + On GPU, assume atomics are fast, so don't accumuate into scratch. + On CPU spaces, there's no real point to using scratch, just rely on the + memory hierarchy. Using scratch just increases the number of required + atomic operations + */ + if (KokkosSparse::NoTranspose[0] == mode[0]) { + constexpr bool CONJ = false; + using GpuOp = SpmvMergeImplFunctor; + using CpuOp = SpmvMergeImplFunctor; + using Op = typename std::conditional< + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + CpuOp>::type; + Op op(alpha, A, x, y, pathLengthThreadChunk); + Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); + } else if (KokkosSparse::Conjugate[0] == mode[0]) { + constexpr bool CONJ = true; + using GpuOp = SpmvMergeImplFunctor; + using CpuOp = SpmvMergeImplFunctor; + using Op = typename std::conditional< + KokkosKernels::Impl::kk_is_gpu_exec_space(), GpuOp, + CpuOp>::type; + Op op(alpha, A, x, y, pathLengthThreadChunk); + Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); + } else { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ + << "SpmvMergeHierarchical::spmv() called with unsupported mode " + << mode; + throw std::logic_error(ss.str()); + } + } +}; + +} // namespace KokkosSparse::Impl + +#endif // KOKKOSSPARSE_SPMV_IMPL_MERGE_HPP 
diff --git a/sparse/impl/KokkosSparse_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_spec.hpp index 95cd022159..8bd52088a5 100644 --- a/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -30,15 +30,13 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_eti_spec_avail { enum : bool { value = false }; }; -template ::type>::value> + std::is_integral_v> struct spmv_mv_eti_spec_avail { enum : bool { value = false }; }; @@ -46,38 +44,44 @@ struct spmv_mv_eti_spec_avail { } // namespace Impl } // namespace KokkosSparse -#define KOKKOSSPARSE_SPMV_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct spmv_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct spmv_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct spmv_mv_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - 
SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct spmv_mv_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -94,33 +98,19 @@ namespace Impl { /// \brief Implementation of KokkosSparse::spmv (sparse matrix - dense /// vector multiply) for single vectors (1-D Views). /// -/// The first 5 template parameters are the same as those of -/// KokkosSparse::CrsMatrix. In particular: -/// -/// AT: type of each entry of the sparse matrix -/// AO: ordinal type (type of column indices) of the sparse matrix -/// AS: offset type (type of row offsets) of the sparse matrix -/// -/// The next 4 template parameters (that start with X) correspond to -/// the input Kokkos::View. The last 4 template parameters (that start -/// with Y) correspond to the output Kokkos::View. -/// /// For the implementation of KokkosSparse::spmv for multivectors (2-D /// Views), see the SPMV_MV struct below. 
-template ::value, - bool eti_spec_avail = spmv_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> +template < + class ExecutionSpace, class AMatrix, class XVector, class YVector, + bool tpl_spec_avail = + spmv_tpl_spec_avail::value, + bool eti_spec_avail = + spmv_eti_spec_avail::value> struct SPMV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; - typedef typename YVector::non_const_value_type coefficient_type; - static void spmv(const KokkosKernels::Experimental::Controls& controls, + static void spmv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& controls, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); @@ -146,39 +136,22 @@ struct SPMV { /// matrix, and Op(A) is either A itself, its transpose, or its /// conjugate transpose, depending on the 'mode' argument. /// -/// The first 5 template parameters are the template parameters of the -/// input 1-D View of coefficients 'alpha'. The next 5 template -/// parameters are the same as those of KokkosSparse::CrsMatrix. In -/// particular: -/// -/// AT: type of each entry of the sparse matrix -/// AO: ordinal type (type of column indices) of the sparse matrix -/// AS: offset type (type of row offsets) of the sparse matrix -/// -/// The next 4 template parameters (that start with X) correspond to -/// the input Kokkos::View. The 4 template parameters after that -/// (that start with lower-case b) are the template parameters of the -/// input 1-D View of coefficients 'beta'. Next, the 5 template -/// parameters that start with Y correspond to the output -/// Kokkos::View. The last template parameter indicates whether the +/// The last template parameter (integerScalarType) indicates whether the /// matrix's entries have integer type. 
Per Github Issue #700, we /// don't optimize as heavily for that case, in order to reduce build /// times and library sizes. -template ::type>::value, - bool tpl_spec_avail = spmv_mv_tpl_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value, - bool eti_spec_avail = spmv_mv_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + std::is_integral_v, + bool tpl_spec_avail = spmv_mv_tpl_spec_avail::value, + bool eti_spec_avail = spmv_mv_eti_spec_avail::value> struct SPMV_MV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const KokkosKernels::Experimental::Controls& controls, + static void spmv_mv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& controls, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); @@ -187,16 +160,13 @@ struct SPMV_MV { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of spmv for single vectors (1-D Views). 
// Unification layer -template -struct SPMV +struct SPMV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv(const KokkosKernels::Experimental::Controls& controls, + static void spmv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& controls, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { @@ -204,79 +174,76 @@ struct SPMV(controls, mode, alpha, A, x, beta, - y); + spmv_beta( + space, controls, mode, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_beta(controls, mode, alpha, A, x, beta, - y); + spmv_beta( + space, controls, mode, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_beta(controls, mode, alpha, A, x, - beta, y); + spmv_beta( + space, controls, mode, alpha, A, x, beta, y); } else { - spmv_beta(controls, mode, alpha, A, x, beta, - y); + spmv_beta( + space, controls, mode, alpha, A, x, beta, y); } } }; //! Full specialization of spmv_mv for single vectors (2-D Views). 
// Unification layer -template -struct SPMV_MV +struct SPMV_MV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/, + static void spmv_mv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& /*controls*/, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { - spmv_alpha_mv(mode, alpha, A, x, beta, y); + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } else if (alpha == KAT::one()) { - spmv_alpha_mv(mode, alpha, A, x, beta, y); + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } else if (alpha == -KAT::one()) { - spmv_alpha_mv(mode, alpha, A, x, beta, y); + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } else { - spmv_alpha_mv(mode, alpha, A, x, beta, y); + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); } } }; -template -struct SPMV_MV +struct SPMV_MV { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/, + static void spmv_mv(const ExecutionSpace& space, + const KokkosKernels::Experimental::Controls& /*controls*/, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); - typedef SPMV - impl_type; KokkosKernels::Experimental::Controls defaultControls; for (typename AMatrix::non_const_size_type j = 0; j < x.extent(1); ++j) { auto x_j = Kokkos::subview(x, Kokkos::ALL(), j); auto y_j = Kokkos::subview(y, 
Kokkos::ALL(), j); - impl_type::spmv(defaultControls, mode, alpha, A, x_j, beta, y_j); + typedef SPMV + impl_type; + impl_type::spmv(space, defaultControls, mode, alpha, A, x_j, beta, y_j); } } }; @@ -292,65 +259,77 @@ struct SPMV_MV, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SPMV< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; -#define KOKKOSSPARSE_SPMV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct SPMV< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SPMV< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; -#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - extern template struct SPMV_MV< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - 
Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SPMV_MV< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; -#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct SPMV_MV< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SPMV_MV< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; #include diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 8f217e05aa..a582f18e40 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ #define 
KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ +#include + #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas1_scal.hpp" @@ -29,10 +31,9 @@ namespace Impl { enum { FD, FE }; // This TransposeFunctor is functional, but not necessarily performant. -template +template struct SPMV_Struct_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -91,13 +92,12 @@ struct SPMV_Struct_Transpose_Functor { } }; -template +template struct SPMV_Struct_Functor { typedef typename AMatrix::non_const_size_type size_type; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type value_type; - typedef typename AMatrix::execution_space execution_space; typedef typename execution_space::scratch_memory_space scratch_space; typedef typename KokkosSparse::SparseRowViewConst row_view_const; typedef typename Kokkos::TeamPolicy team_policy; @@ -144,6 +144,7 @@ struct SPMV_Struct_Functor { beta(beta_), m_y(m_y_), stencil_type(stencil_type_), + numExterior(0), rows_per_team(rows_per_team_), rows_per_team_ext(rows_per_team_ext_) { static_assert(static_cast(XVector::rank) == 1, @@ -164,8 +165,8 @@ struct SPMV_Struct_Functor { } } - void compute_interior(const int64_t worksets, const int team_size, - const int vector_length) { + void compute_interior(const execution_space& exec, const int64_t worksets, + const int team_size, const int vector_length) { if (numDimensions == 1) { // Treat interior points using structured algorithm numInterior = ni - 2; @@ -173,16 +174,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(3); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, 
vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -198,16 +199,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(5); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -217,16 +218,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(9); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -243,16 +244,16 @@ struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(7); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -262,16 +263,16 @@ 
struct SPMV_Struct_Functor { size_t shared_size = shared_ordinal_1d::shmem_size(27); Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length) + exec, worksets, Kokkos::AUTO, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length) + exec, worksets, team_size, vector_length) .set_scratch_size(0, Kokkos::PerTeam(shared_size)); } Kokkos::parallel_for( @@ -548,15 +549,15 @@ struct SPMV_Struct_Functor { }); } - void compute_exterior(const int64_t worksets, const int team_size, - const int vector_length) { + void compute_exterior(const execution_space& exec, const int64_t worksets, + const int team_size, const int vector_length) { // Treat exterior points using unstructured algorithm if (numDimensions == 1) { numExterior = 2; if (numExterior > 0) { Kokkos::RangePolicy > - policy(0, numExterior); + policy(exec, 0, numExterior); Kokkos::parallel_for( "KokkosSparse::spmv_struct: exterior", policy, *this); @@ -567,15 +568,15 @@ struct SPMV_Struct_Functor { if (numExterior > 0) { Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); } Kokkos::parallel_for( "KokkosSparse::spmv_struct: exterior", policy, @@ -587,15 +588,15 @@ struct SPMV_Struct_Functor { if (numExterior > 0) { Kokkos::TeamPolicy > - policy(1, 1); + policy(exec, 1, 1); if (team_size < 0) { policy = Kokkos::TeamPolicy >( - worksets, Kokkos::AUTO, vector_length); + exec, worksets, Kokkos::AUTO, vector_length); } else { policy = Kokkos::TeamPolicy >( - worksets, team_size, vector_length); + exec, worksets, team_size, vector_length); } Kokkos::parallel_for( @@ -773,17 
+774,16 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, return rows_per_team; } // spmv_struct_launch_parameters -template +template static void spmv_struct_beta_no_transpose( - const int stencil_type, + const execution_space& exec, const int stencil_type, const Kokkos::View& structure, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { typedef typename AMatrix::ordinal_type ordinal_type; - typedef typename AMatrix::execution_space execution_space; if (A.numRows() <= static_cast(0)) { return; } @@ -833,18 +833,21 @@ static void spmv_struct_beta_no_transpose( int64_t worksets_exterior = (numExteriorPts + rows_per_team_ext - 1) / rows_per_team_ext; - SPMV_Struct_Functor spmv_struct( - structure, stencil_type, alpha, A, x, beta, y, rows_per_team_int, - rows_per_team_ext); + SPMV_Struct_Functor + spmv_struct(structure, stencil_type, alpha, A, x, beta, y, + rows_per_team_int, rows_per_team_ext); - spmv_struct.compute_interior(worksets_interior, team_size_int, vector_length); - spmv_struct.compute_exterior(worksets_exterior, team_size_ext, vector_length); + spmv_struct.compute_interior(exec, worksets_interior, team_size_int, + vector_length); + spmv_struct.compute_exterior(exec, worksets_exterior, team_size_ext, + vector_length); } // spmv_struct_beta_no_transpose -template +template static void spmv_struct_beta_transpose( - const int /*stencil_type*/, + const execution_space& exec, const int /*stencil_type*/, const Kokkos::View& /*structure*/, typename YVector::const_value_type& alpha, const AMatrix& A, @@ -859,7 +862,7 @@ static void spmv_struct_beta_transpose( // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } typedef typename AMatrix::size_type size_type; @@ -875,62 +878,66 @@ static void spmv_struct_beta_transpose( (vector_length < 32)) vector_length *= 2; - typedef SPMV_Struct_Transpose_Functor + typedef SPMV_Struct_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow)); + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow)); - const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); } -template +template static void spmv_struct_beta( - const char mode[], const int stencil_type, + const execution_space& exec, const char mode[], const int stencil_type, const Kokkos::View& structure, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_struct_beta_no_transpose( - stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta_no_transpose(exec, stencil_type, structure, + alpha, A, x, beta, y); } else if (mode[0] == Conjugate[0]) { - spmv_struct_beta_no_transpose( - stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta_no_transpose(exec, stencil_type, structure, + alpha, A, x, beta, y); } else if (mode[0] == Transpose[0]) { - spmv_struct_beta_transpose( - stencil_type, structure, 
alpha, A, x, beta, y); + spmv_struct_beta_transpose(exec, stencil_type, structure, + alpha, A, x, beta, y); } else if (mode[0] == ConjugateTranspose[0]) { - spmv_struct_beta_transpose( - stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta_transpose(exec, stencil_type, structure, + alpha, A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv_struct()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv_struct()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } // Functor for implementing transpose and conjugate transpose sparse // matrix-vector multiply with multivector (2-D View) input and // output. This functor works, but is not necessarily performant. -template +template struct SPMV_MV_Struct_Transpose_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; @@ -1007,10 +1014,9 @@ struct SPMV_MV_Struct_Transpose_Functor { } }; -template +template struct SPMV_MV_Struct_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; @@ -1245,9 +1251,10 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { } }; -template +template static void spmv_alpha_beta_mv_struct_no_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1258,7 +1265,7 @@ static void spmv_alpha_beta_mv_struct_no_transpose( } if (doalpha == 0) { if (dobeta != 1) { - KokkosBlas::scal(y, 
beta, y); + KokkosBlas::scal(exec, y, beta, y); } return; } else { @@ -1278,11 +1285,10 @@ static void spmv_alpha_beta_mv_struct_no_transpose( #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_LayoutLeft_Functor + typedef SPMV_MV_Struct_LayoutLeft_Functor< + execution_space, AMatrix, XVector, YVector, doalpha, dobeta, conjugate> OpType; - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -1292,30 +1298,28 @@ static void spmv_alpha_beta_mv_struct_no_transpose( // then this is just the number of rows. Ditto for rows_per_team. // team_size is a hardware resource thing so it might legitimately // be int. - const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_Struct_LayoutLeft_Functor + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow), + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow), vector_length); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here @@ -1323,25 +1327,25 @@ static void spmv_alpha_beta_mv_struct_no_transpose( // 
then this is just the number of rows. Ditto for rows_per_team. // team_size is a hardware resource thing so it might legitimately // be int. - const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #endif // KOKKOS_FAST_COMPILE } } -template +template static void spmv_alpha_beta_mv_struct_transpose( + const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, const YVector& y) { @@ -1354,7 +1358,7 @@ static void spmv_alpha_beta_mv_struct_transpose( // We need to scale y first ("scaling" by zero just means filling // with zeros), since the functor works by atomic-adding into y. 
if (dobeta != 1) { - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(exec, y, beta, y); } if (doalpha != 0) { @@ -1374,11 +1378,10 @@ static void spmv_alpha_beta_mv_struct_transpose( #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and // dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_Transpose_Functor + typedef SPMV_MV_Struct_Transpose_Functor< + execution_space, AMatrix, XVector, YVector, doalpha, dobeta, conjugate> OpType; - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow)); + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow)); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -1387,78 +1390,84 @@ static void spmv_alpha_beta_mv_struct_transpose( // then this is just the number of rows. Ditto for rows_per_team. // team_size is a hardware resource thing so it might legitimately // be int. - const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for // alpha/beta - typedef SPMV_MV_Struct_Transpose_Functor + typedef SPMV_MV_Struct_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op(alpha, A, x, beta, y, - RowsPerThread(NNZPerRow)); + OpType op(alpha, A, x, beta, y, RowsPerThread(NNZPerRow)); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? 
For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. // team_size is a hardware resource thing so it might legitimately // be int. - const int rows_per_thread = - RowsPerThread(NNZPerRow); + const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = - Kokkos::TeamPolicy( - rows_per_thread, Kokkos::AUTO, vector_length) + Kokkos::TeamPolicy(exec, rows_per_thread, Kokkos::AUTO, + vector_length) .team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", - Kokkos::TeamPolicy( - nteams, team_size, vector_length), + Kokkos::TeamPolicy( + exec, nteams, team_size, vector_length), op); #endif // KOKKOS_FAST_COMPILE } } -template +template static void spmv_alpha_beta_mv_struct( - const char mode[], const typename YVector::non_const_value_type& alpha, - const AMatrix& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { + const execution_space& exec, const char mode[], + const typename YVector::non_const_value_type& alpha, const AMatrix& A, + const XVector& x, const typename YVector::non_const_value_type& beta, + const YVector& y) { if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_struct_no_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct_no_transpose( + exec, alpha, A, x, beta, y); } else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_struct_no_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct_no_transpose( + exec, alpha, A, x, beta, y); } else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_struct_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct_transpose( + exec, alpha, A, x, beta, y); } else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_struct_transpose(alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct_transpose( + exec, alpha, 
A, x, beta, y); } else { - KokkosKernels::Impl::throw_runtime_exception( - "Invalid Transpose Mode for KokkosSparse::spmv()"); + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " Invalid transpose mode " << mode + << " for KokkosSparse::spmv_struct()"; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } } -template -void spmv_alpha_mv_struct(const char mode[], +template +void spmv_alpha_mv_struct(const execution_space& exec, const char mode[], const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1467,17 +1476,17 @@ void spmv_alpha_mv_struct(const char mode[], typedef Kokkos::ArithTraits KAT; if (beta == KAT::zero()) { - spmv_alpha_beta_mv_struct( - mode, alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct(exec, mode, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_alpha_beta_mv_struct( - mode, alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct(exec, mode, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_alpha_beta_mv_struct( - mode, alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct(exec, mode, alpha, A, x, beta, y); } else { - spmv_alpha_beta_mv_struct( - mode, alpha, A, x, beta, y); + spmv_alpha_beta_mv_struct(exec, mode, alpha, A, x, beta, y); } } diff --git a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp index 9b22278db2..103bea8781 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp @@ -29,16 +29,14 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_struct_eti_spec_avail { enum : bool { value = false }; }; -template ::type>::value> + std::is_integral_v> struct spmv_mv_struct_eti_spec_avail { enum : bool { value = false }; }; @@ -46,38 +44,44 @@ struct spmv_mv_struct_eti_spec_avail { } // namespace Impl } // namespace 
KokkosSparse -#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_AVAIL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template <> \ - struct spmv_struct_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_struct_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; -#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_AVAIL( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template <> \ - struct spmv_mv_struct_eti_spec_avail< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_mv_struct_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value 
= true }; \ }; // Include the actual specialization declarations @@ -92,34 +96,18 @@ namespace Impl { /// \brief Implementation of KokkosSparse::spmv_struct (sparse structured matrix /// - dense vector multiply) for single vectors (1-D Views). /// -/// The first 5 template parameters are the same as those of -/// KokkosSparse::CrsMatrix. In particular: -/// -/// AT: type of each entry of the sparse matrix -/// AO: ordinal type (type of column indices) of the sparse matrix -/// AS: offset type (type of row offsets) of the sparse matrix -/// -/// The next 4 template parameters (that start with X) correspond to -/// the input Kokkos::View. The last 4 template parameters (that start -/// with Y) correspond to the output Kokkos::View. -/// /// For the implementation of KokkosSparse::spmv_struct for multivectors (2-D /// Views), see the SPMV_STRUCT struct below. -template ::value, + ExecutionSpace, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_struct_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + ExecutionSpace, AMatrix, XVector, YVector>::value> struct SPMV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; - typedef typename YVector::non_const_value_type coefficient_type; static void spmv_struct( - const char mode[], const int stencil_type, + const ExecutionSpace& space, const char mode[], const int stencil_type, const Kokkos::View& structure, const coefficient_type& alpha, const AMatrix& A, const XVector& x, @@ -146,57 +134,36 @@ struct SPMV_STRUCT { /// matrix, and Op(A) is either A itself, its transpose, or its /// conjugate transpose, depending on the 'mode' argument. /// -/// The first 5 template parameters are the template parameters of the -/// input 1-D View of coefficients 'alpha'. The next 5 template -/// parameters are the same as those of KokkosSparse::CrsMatrix. 
In -/// particular: -/// -/// AT: type of each entry of the sparse matrix -/// AO: ordinal type (type of column indices) of the sparse matrix -/// AS: offset type (type of row offsets) of the sparse matrix -/// -/// The next 4 template parameters (that start with X) correspond to -/// the input Kokkos::View. The 4 template parameters after that -/// (that start with lower-case b) are the template parameters of the -/// input 1-D View of coefficients 'beta'. Next, the 4 template -/// parameters that start with Y correspond to the output -/// Kokkos::View. The last template parameter indicates whether the +/// The last template parameter integerScalarType indicates whether the /// matrix's entries have integer type. Per Github Issue #700, we /// don't optimize as heavily for that case, in order to reduce build /// times and library sizes. -template ::type>::value, + std::is_integral_v, bool tpl_spec_avail = spmv_mv_struct_tpl_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value, + ExecutionSpace, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_mv_struct_eti_spec_avail< - AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> + ExecutionSpace, AMatrix, XVector, YVector>::value> struct SPMV_MV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv_struct(const char mode[], const coefficient_type& alpha, - const AMatrix& A, const XVector& x, - const coefficient_type& beta, const YVector& y); + static void spmv_mv_struct(const ExecutionSpace& space, const char mode[], + const coefficient_type& alpha, const AMatrix& A, + const XVector& x, const coefficient_type& beta, + const YVector& y); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of spmv for single vectors (1-D Views). 
// Unification layer -template -struct SPMV_STRUCT +struct SPMV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; static void spmv_struct( - const char mode[], const int stencil_type, + const ExecutionSpace& space, const char mode[], const int stencil_type, const Kokkos::View& structure, const coefficient_type& alpha, const AMatrix& A, const XVector& x, @@ -207,80 +174,72 @@ struct SPMV_STRUCT( - mode, stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta( + space, mode, stencil_type, structure, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_struct_beta( - mode, stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta( + space, mode, stencil_type, structure, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_struct_beta( - mode, stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta( + space, mode, stencil_type, structure, alpha, A, x, beta, y); } else { - spmv_struct_beta( - mode, stencil_type, structure, alpha, A, x, beta, y); + spmv_struct_beta( + space, mode, stencil_type, structure, alpha, A, x, beta, y); } } }; //! Full specialization of spmv_mv for single vectors (2-D Views). 
// Unification layer -template -struct SPMV_MV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; +template +struct SPMV_MV_STRUCT { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv_struct(const char mode[], const coefficient_type& alpha, - const AMatrix& A, const XVector& x, - const coefficient_type& beta, const YVector& y) { + static void spmv_mv_struct(const ExecutionSpace& space, const char mode[], + const coefficient_type& alpha, const AMatrix& A, + const XVector& x, const coefficient_type& beta, + const YVector& y) { typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { - spmv_alpha_mv_struct(mode, alpha, A, x, - beta, y); + spmv_alpha_mv_struct( + space, mode, alpha, A, x, beta, y); } else if (alpha == KAT::one()) { - spmv_alpha_mv_struct(mode, alpha, A, x, - beta, y); + spmv_alpha_mv_struct( + space, mode, alpha, A, x, beta, y); } else if (alpha == -KAT::one()) { - spmv_alpha_mv_struct(mode, alpha, A, x, - beta, y); + spmv_alpha_mv_struct( + space, mode, alpha, A, x, beta, y); } else { - spmv_alpha_mv_struct(mode, alpha, A, x, - beta, y); + spmv_alpha_mv_struct( + space, mode, alpha, A, x, beta, y); } } }; -template -struct SPMV_MV_STRUCT { - typedef CrsMatrix AMatrix; - typedef Kokkos::View XVector; - typedef Kokkos::View YVector; +template +struct SPMV_MV_STRUCT { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv_struct(const char mode[], const coefficient_type& alpha, - const AMatrix& A, const XVector& x, - const coefficient_type& beta, const YVector& y) { - static_assert(std::is_integral::value, + static void spmv_mv_struct(const ExecutionSpace& space, const char mode[], + const coefficient_type& alpha, const AMatrix& A, + const XVector& x, const coefficient_type& beta, + const YVector& y) { + static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); - typedef SPMV_STRUCT - impl_type; + 
typedef SPMV_STRUCT impl_type; for (typename AMatrix::non_const_size_type j = 0; j < x.extent(1); ++j) { auto x_j = Kokkos::subview(x, Kokkos::ALL(), j); auto y_j = Kokkos::subview(y, Kokkos::ALL(), j); - impl_type::spmv_struct(mode, alpha, A, x_j, beta, y_j); + impl_type::spmv_struct(space, mode, alpha, A, x_j, beta, y_j); } } }; @@ -296,65 +255,77 @@ struct SPMV_MV_STRUCT, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_STRUCT< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; -#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_INST( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template struct SPMV_STRUCT< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE*, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, false, true>; +#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_STRUCT< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const*, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; -#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL( \ - 
SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - extern template struct SPMV_MV_STRUCT< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_MV_STRUCT< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; -#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_INST( \ - SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE) \ - template struct SPMV_MV_STRUCT< \ - const SCALAR_TYPE, const ORDINAL_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR_TYPE**, LAYOUT_TYPE, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - std::is_integral::type>::value, false, \ - true>; +#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_MV_STRUCT< \ + EXEC_SPACE_TYPE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + Kokkos::View< \ + SCALAR_TYPE const**, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + std::is_integral_v, false, true>; #include diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp 
b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index e2a625e2a7..ee7e83b554 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2734,8 +2734,8 @@ struct ReturnRangePolicyType { #endif #ifdef KOKKOS_ENABLE_HIP template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; static inline PolicyType get_policy(int nt, int ts) { return PolicyType(nt, ts); @@ -2908,27 +2908,36 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, // Keep this a host View, create device version and copy to back to host // during scheduling This requires making sure the host view in the handle is // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename execution_space::memory_space; + using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; using range_type = Kokkos::pair; + using row_map_host_view_t = Kokkos::View; + + row_map_host_view_t row_map_host; const scalar_t zero(0.0); const scalar_t one(1.0); - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - Kokkos::View row_map_host( - 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); + auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); + + row_map_host = row_map_host_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); + Kokkos::deep_copy(row_map_host, row_map); + } // inversion options const bool invert_diagonal = thandle.get_invert_diagonal(); @@ -3289,23 +3298,30 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename execution_space::memory_space; + using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; + using range_type = Kokkos::pair; + using row_map_host_view_t = Kokkos::View; - using range_type = Kokkos::pair; + row_map_host_view_t row_map_host; const scalar_t zero(0.0); const scalar_t one(1.0); auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - Kokkos::View row_map_host( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + 
Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); + + row_map_host = row_map_host_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); + Kokkos::deep_copy(row_map_host, row_map); + } // supernode sizes const int *supercols = thandle.get_supercols(); diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 00fdcd2442..f1f7a0e6cd 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -633,22 +633,18 @@ class TwostageGaussSeidel { // shift ptr so that it now contains offsets (combine it with the previous // functor calls?) if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( 1 + num_rows, rowmap_viewL); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( 1 + num_rows, rowmap_viewLa); } } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( 1 + num_rows, rowmap_viewU); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( 1 + num_rows, rowmap_viewUa); } } diff --git a/sparse/src/KokkosKernels_Controls.hpp b/sparse/src/KokkosKernels_Controls.hpp index c600dad89a..594df031a3 100644 --- a/sparse/src/KokkosKernels_Controls.hpp +++ b/sparse/src/KokkosKernels_Controls.hpp @@ -20,7 +20,9 @@ /// \brief Mechanism to control internal behavior of kernels /// \author Luc Berger-Vergiat (lberge@sandia.gov) +#include #include +#include #include "KokkosKernels_config.h" #include "KokkosKernels_tpl_handles_decl.hpp" @@ -43,8 +45,13 @@ namespace Experimental { // 
Declaration of Controls class class Controls { public: + using key_type = std::string; + using mapped_type = std::string; + using value_type = std::pair; + // Constructor Controls() = default; + Controls(std::initializer_list init) : kernel_parameters(init) {} // set a new parameter void setParameter(const std::string& name, const std::string& value) { @@ -60,12 +67,10 @@ class Controls { /// /// \param name the name of the parameter to retrieve /// \param orUnset (default \c "" ) the value to return if \c name is not set - std::string getParameter(const std::string& name, - const std::string& orUnset = "") const { + key_type getParameter(const std::string& name, + const std::string& orUnset = "") const { auto search = kernel_parameters.find(name); if (kernel_parameters.end() == search) { - std::cerr << "WARNING: Controls::getParameter for name \"" << name - << "\" was unset" << std::endl; return orUnset; } else { return search->second; @@ -125,7 +130,7 @@ class Controls { private: // storage for kernel parameters - std::unordered_map kernel_parameters; + std::unordered_map kernel_parameters; }; } // namespace Experimental diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index dae3f12462..d500f19d48 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -601,7 +601,36 @@ class KokkosKernelsHandle { "GS."); return cgs; } + + // clang-format off + /** + * @brief Create a gauss seidel handle object + * + * @param handle_exec_space The execution space instance to execute kernels on. + * @param num_streams The number of streams to allocate memory for. + * @param gs_algorithm Specifies which algorithm to use: + * + * KokkosSpace::GS_DEFAULT PointGaussSeidel + * KokkosSpace::GS_PERMUTED ?? + * KokkosSpace::GS_TEAM ?? + * KokkosSpace::GS_CLUSTER ?? + * KokkosSpace::GS_TWOSTAGE ?? 
+ * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) + */ + // clang-format on void create_gs_handle( + const HandleExecSpace &handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { @@ -610,10 +639,50 @@ class KokkosKernelsHandle { // ---------------------------------------- // // Two-stage Gauss-Seidel if (gs_algorithm == KokkosSparse::GS_TWOSTAGE) - this->gsHandle = new TwoStageGaussSeidelHandleType(); - else this->gsHandle = - new PointGaussSeidelHandleType(gs_algorithm, coloring_algorithm); + new TwoStageGaussSeidelHandleType(handle_exec_space, num_streams); + else + this->gsHandle = new PointGaussSeidelHandleType( + handle_exec_space, num_streams, gs_algorithm, coloring_algorithm); + } + + // clang-format off + /** + * @brief Create a gauss seidel handle object + * + * @param gs_algorithm Specifies which algorithm to use: + * + * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. + * KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows. + * KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows. + * KokkosSpace::GS_CLUSTER Uses independent clusters of nodes in the graph. 
Within a cluster, x is updated sequentially. + * For more information, see: https://arxiv.org/pdf/2204.02934.pdf. + * KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x. + * For more information, see: https://arxiv.org/pdf/2104.01196.pdf. + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT Depends on execution space: + * COLORING_SERIAL on Kokkos::Serial; + * COLORING_EB on GPUs; + * COLORING_VBBIT on Kokkos::Sycl or elsewhere. + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) + */ + // clang-format on + void create_gs_handle( + KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algorithm = + KokkosGraph::COLORING_DEFAULT) { + HandleExecSpace handle_exec_space; + return create_gs_handle(handle_exec_space, 1, gs_algorithm, + coloring_algorithm); } // ---------------------------------------- // // Two-stage Gauss-Seidel handle @@ -672,6 +741,31 @@ class KokkosKernelsHandle { gs2->setCompactForm(compact_form); } + // clang-format off + /** + * @brief Create a gs handle object + * + * @param clusterAlgo Specifies which clustering algorithm to use: + * + * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? + * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? 
+ * @param hint_verts_per_cluster Hint how many verticies to use per cluster + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) + */ + // clang-format on void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index a366245a86..e0d6e61a3b 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -393,10 +393,18 @@ class BsrMatrix { //! Nonconst version of the type of the entries in the sparse matrix. typedef typename values_type::non_const_value_type non_const_value_type; - // block values are actually a 1-D view, however they are implicitly - // arranged in LayoutRight, e.g. consecutive entries in the values view - // are consecutive entries within a row inside a block - using block_layout = Kokkos::LayoutRight; + //! block values are actually a 1-D view, however they are implicitly + //! arranged in LayoutRight, e.g. consecutive entries in the values view + //! are consecutive entries within a row inside a block + using block_layout_type = Kokkos::LayoutRight; + + //! Type returned by \c unmanaged_block + using block_type = Kokkos::View; + + //! 
Type returned by \c unmanaged_block_const + using const_block_type = Kokkos::View; /// \name Storage of the actual sparsity structure and values. /// @@ -455,7 +463,8 @@ class BsrMatrix { blockDim_(blockDimIn) { if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } } @@ -480,21 +489,19 @@ class BsrMatrix { /// \param cols [in] The column indices. cols[k] is the column /// index of val[k]. /// \param blockdim [in] The block size of the constructed BsrMatrix. - /// \param pad [in] If true, pad the sparse matrix's storage with - /// zeros in order to improve cache alignment and / or - /// vectorization. + /// \param pad [in] Ignored /// /// The \c pad argument is currently not used. BsrMatrix(const std::string& label, OrdinalType nrows, OrdinalType ncols, size_type annz, ScalarType* vals, OrdinalType* rows, OrdinalType* cols, OrdinalType blockdim, bool pad = false) { - (void)label; (void)pad; blockDim_ = blockdim; if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -532,10 +539,10 @@ class BsrMatrix { // device data typename row_map_type::non_const_type row_map_device( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_device"), + Kokkos::view_alloc(Kokkos::WithoutInitializing, label + " row_map"), numRows + 1); - index_type entries_device("entries_device", numBlocks); - Kokkos::resize(values, annz); + index_type entries_device(label + " entries", numBlocks); + values = values_type(label + " values", annz); // mirror views on host auto row_map_host = Kokkos::create_mirror_view(row_map_device); @@ -656,7 +663,8 @@ class BsrMatrix { 
blockDim_(blockDimIn) { if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -698,7 +706,8 @@ class BsrMatrix { : graph(graph_), values(vals), numCols_(ncols), blockDim_(blockDimIn) { if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } } @@ -719,7 +728,8 @@ class BsrMatrix { blockDim_ = blockDimIn; if (blockDim_ < 1) { std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + os << "KokkosSparse::Experimental::BsrMatrix: Inappropriate block size: " + << blockDim_; KokkosKernels::Impl::throw_runtime_exception(os.str()); } @@ -988,6 +998,20 @@ class BsrMatrix { } } + /*! 
\brief return an unmanaged view of block i */ + KOKKOS_INLINE_FUNCTION + block_type unmanaged_block(const size_type i) const { + // cast up to the size_type to help avoid an overflow + + return block_type(&values(i * blockDim_ * blockDim_), blockDim_, blockDim_); + } + KOKKOS_INLINE_FUNCTION + const_block_type unmanaged_block_const(const size_type i) const { + // cast up to the size_type to help avoid an overflow + return const_block_type(&values(i * blockDim_ * blockDim_), blockDim_, + blockDim_); + } + protected: enum class valueOperation { ADD, ASSIGN }; diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index be3ac80343..7070172a1f 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -63,7 +63,7 @@ inline int RowsPerThread(const int /*NNZPerRow*/) { #endif #ifdef KOKKOS_ENABLE_HIP template <> -inline int RowsPerThread(const int /*NNZPerRow*/) { +inline int RowsPerThread(const int /*NNZPerRow*/) { return 1; } #endif diff --git a/sparse/src/KokkosSparse_OrdinalTraits.hpp b/sparse/src/KokkosSparse_OrdinalTraits.hpp index 6d76460939..8a487de030 100644 --- a/sparse/src/KokkosSparse_OrdinalTraits.hpp +++ b/sparse/src/KokkosSparse_OrdinalTraits.hpp @@ -55,44 +55,48 @@ struct OrdinalTraits { template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION short int invalid() { return -1; } + static constexpr KOKKOS_INLINE_FUNCTION short int invalid() { return -1; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION unsigned short int invalid() { + static constexpr KOKKOS_INLINE_FUNCTION unsigned short int invalid() { return USHRT_MAX; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION int invalid() { return -1; } + static constexpr KOKKOS_INLINE_FUNCTION int invalid() { return -1; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION unsigned int invalid() { return UINT_MAX; } + static constexpr KOKKOS_INLINE_FUNCTION unsigned 
int invalid() { + return UINT_MAX; + } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION long invalid() { return -1; } + static constexpr KOKKOS_INLINE_FUNCTION long invalid() { return -1; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION unsigned long invalid() { return ULONG_MAX; } + static constexpr KOKKOS_INLINE_FUNCTION unsigned long invalid() { + return ULONG_MAX; + } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION long long invalid() { return -1; } + static constexpr KOKKOS_INLINE_FUNCTION long long invalid() { return -1; } }; template <> struct OrdinalTraits { - static KOKKOS_INLINE_FUNCTION unsigned long long invalid() { + static constexpr KOKKOS_INLINE_FUNCTION unsigned long long invalid() { return ULLONG_MAX; } }; diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 31b835d358..c26ace9c69 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -627,9 +627,14 @@ void sort_and_merge_matrix(const exec_space& exec, values_out = values_in; return; } + // Have to do the compression. Create a _shallow_ copy of the input + // to preserve it, in case the input and output views are identical + // references. 
+ auto rowmap_orig = rowmap_in; + auto entries_orig = entries_in; + auto values_orig = values_in; // Prefix sum to get rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( exec, numRows + 1, nc_rowmap_out); rowmap_out = nc_rowmap_out; entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, @@ -642,7 +647,7 @@ void sort_and_merge_matrix(const exec_space& exec, Kokkos::parallel_for( range_t(exec, 0, numRows), Impl::MatrixMergedEntriesFunctor( - rowmap_in, entries_in, values_in, rowmap_out, entries_out, + rowmap_orig, entries_orig, values_orig, rowmap_out, entries_out, values_out)); } @@ -746,12 +751,16 @@ void sort_and_merge_graph(const exec_space& exec, entries_out = entries_in; return; } + // Have to do the compression. Create a _shallow_ copy of the input + // to preserve it, in case the input and output views are identical + // references. + auto rowmap_orig = rowmap_in; + auto entries_orig = entries_in; // Prefix sum to get rowmap. // In the case where the output rowmap is the same as the input, we could just // assign "rowmap_out = rowmap_in" except that would break const-correctness. // Can skip filling the entries, however. 
- KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( exec, numRows + 1, nc_rowmap_out); rowmap_out = nc_rowmap_out; entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, @@ -760,7 +769,7 @@ void sort_and_merge_graph(const exec_space& exec, // Compute merged entries and values Kokkos::parallel_for(range_t(exec, 0, numRows), Impl::GraphMergedEntriesFunctor( - rowmap_in, entries_in, rowmap_out, entries_out)); + rowmap_orig, entries_orig, rowmap_out, entries_out)); } template diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 4039b6f5a7..f3fbec1836 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -412,8 +412,7 @@ void transpose_matrix( team_size, thread_size), tm); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -497,8 +496,7 @@ void transpose_graph( team_size, thread_size), tm); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -802,8 +800,7 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(tmp_reverse_size + 1, tmp_color_xadj); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( tmp_reverse_size + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -838,8 +835,7 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(num_reverse_elements + 1, reverse_map_xadj); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_reverse_elements + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -1500,8 +1496,7 @@ crstmat_t kk_get_lower_triangle( nr, ne, rowmap, entries, new_row_map.data(), new_indices, 
use_dynamic_scheduling, chunksize); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nr + 1, new_row_map); exec_space().fence(); @@ -1558,8 +1553,7 @@ crstmat_t kk_get_lower_crs_matrix( nr, ne, rowmap, entries, new_row_map.data(), new_indices, use_dynamic_scheduling, chunksize); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nr + 1, new_row_map); exec_space().fence(); @@ -1612,8 +1606,7 @@ graph_t kk_get_lower_crs_graph(graph_t in_crs_matrix, kk_get_lower_triangle_count( nr, ne, rowmap, entries, new_row_map.data(), new_indices); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nr + 1, new_row_map); exec_space().fence(); @@ -1666,8 +1659,7 @@ void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr, nr, ne, rowmap, entries, out_rowmap.data(), new_indices.data(), use_dynamic_scheduling, chunksize, is_lower); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, out_rowmap); exec_space().fence(); @@ -1775,8 +1767,7 @@ void kk_create_incidence_matrix_from_original_matrix( permutation.data(), use_dynamic_scheduling, chunksize, sort_decreasing_order); exec_space().fence(); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, out_rowmap); // kk_print_1Dview(out_rowmap, false, 20); @@ -1885,17 +1876,27 @@ struct ReduceLargerRowCount { template void kk_reduce_numrows_larger_than_threshold( - size_t num_elements, view_type view_to_reduce, - typename view_type::const_value_type threshold, + const MyExecSpace &my_exec_space, size_t num_elements, + view_type view_to_reduce, typename view_type::const_value_type threshold, typename view_type::non_const_value_type &sum_reduction) { - typedef Kokkos::RangePolicy my_exec_space; 
+ typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceNumRowsLargerThanThreshold", - my_exec_space(0, num_elements), + range_policy_t(my_exec_space, 0, num_elements), ReduceLargerRowCount(view_to_reduce, threshold), sum_reduction); } +template +void kk_reduce_numrows_larger_than_threshold( + size_t num_elements, view_type view_to_reduce, + typename view_type::const_value_type threshold, + typename view_type::non_const_value_type &sum_reduction) { + MyExecSpace my_exec_space; + kk_reduce_numrows_larger_than_threshold( + my_exec_space, num_elements, view_to_reduce, threshold, sum_reduction); +} + // Note: "block" in member name means it's block internal - otherwise it // addresses sparse rows/columns (whole blocks) within whole matrix. template @@ -2330,6 +2331,217 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, } } +/** + * @brief Count the non-zeros of a sub-block in a CRS matrix and find the first + * and last column indices at each row of the sub-block. 
This is a host function
+ * used by the kk_extract_diagonal_blocks_crsmatrix_sequential()
+ *
+ * @param A_row_map     [in] Row map of A; indexed on the host with operator()
+ * @param A_entries     [in] Column indices of A, sorted in ascending order
+ *                      within each row
+ * @param blk_row_start [in] First row of A covered by the sub-block
+ * @param blk_col_start [in] First column of A covered by the sub-block
+ * @param blk_nrows     [in] Number of rows of the sub-block
+ * @param blk_ncols     [in] Number of columns of the sub-block
+ * @param blk_nnz       [out] Number of entries of A inside the sub-block
+ * @param first_indices [out] Per row, position in A_entries of the first
+ *                      entry with column >= blk_col_start
+ * @param last_indices  [out] Per row, position in A_entries of the last entry
+ *                      with column < blk_col_start + blk_ncols. For a row
+ *                      with no entry in the sub-block this is
+ *                      first_indices(j) - 1, so that
+ *                      last - first + 1 evaluates to 0.
+ */
+template <typename row_map_type, typename entries_type, typename ordinal_type,
+          typename size_type, typename offset_view1d_type>
+void kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential(
+    const row_map_type &A_row_map, const entries_type &A_entries,
+    const ordinal_type &blk_row_start, const ordinal_type &blk_col_start,
+    const ordinal_type &blk_nrows, const ordinal_type &blk_ncols,
+    size_type &blk_nnz, offset_view1d_type &first_indices,
+    offset_view1d_type &last_indices) {
+  // Rowmap of i-th row-oriented sub-matrix
+  auto A_row_map_sub = Kokkos::subview(
+      A_row_map,
+      Kokkos::make_pair(blk_row_start, blk_row_start + blk_nrows + 1));
+
+  blk_nnz = 0;
+
+  for (ordinal_type j = 0; j < blk_nrows; j++) {  // loop through each row
+    size_type k1 = A_row_map_sub(j);
+    size_type k2 = A_row_map_sub(j + 1);
+    size_type k;
+    // Assume column indices are sorted in ascending order
+    // Find the position of the start column in the row
+    for (k = k1; k < k2; k++) {
+      ordinal_type col = A_entries(k);
+      if (col >= blk_col_start) {
+        break;
+      }
+    }
+    first_indices(j) = k;
+    // Find the position of the last column in the row.
+    // k is a one-past position here: a "k >= k1" test on k itself can never
+    // become false for an unsigned size_type when k1 == 0, which would read
+    // A_entries out of bounds for an empty row 0 or a row lying entirely
+    // beyond the block.
+    for (k = k2; k > k1; k--) {
+      ordinal_type col = A_entries(k - 1);
+      if (col < blk_col_start + blk_ncols) {
+        break;
+      }
+    }
+    // k - 1 may wrap for an unsigned size_type when k == 0; the count below
+    // (and the one in kk_extract_subblock_crsmatrix_sequential) then still
+    // evaluates to 0 because first_indices(j) is 0 in that case.
+    last_indices(j) = k - 1;
+    blk_nnz += (last_indices(j) - first_indices(j) + 1);
+  }
+}
+
+/**
+ * @brief Extract a CRS sub-block from a CRS matrix
+ * This is a host function used by the
+ * kk_extract_diagonal_blocks_crsmatrix_sequential()
+ */
+template <typename entries_type, typename values_type, typename ordinal_type,
+          typename size_type, typename offset_view1d_type,
+          typename out_row_map_type, typename out_entries_type,
+          typename out_values_type>
+void kk_extract_subblock_crsmatrix_sequential(
+    const entries_type &A_entries, const values_type &A_values,
+    const ordinal_type &blk_col_start, const ordinal_type &blk_nrows,
+    const size_type &blk_nnz, const offset_view1d_type &first_indices,
+    const offset_view1d_type &last_indices, out_row_map_type &blk_row_map,
+    out_entries_type &blk_entries, out_values_type &blk_values) {
+  // - create out_row_map
+  // - copy A_entries to out_entries and update out_entries with local column
+  // indices
+  // - copy A_values to out_values
+  size_type first_ = 0;  // running offset of the current row in the block
+  for (ordinal_type j = 0; j < blk_nrows; j++) {  // loop through each row
+    size_type nnz = last_indices(j) - first_indices(j) + 1;
+    blk_row_map(j) = first_;
+    for (size_type k = 0; k < nnz; k++) {
+      blk_entries(first_ + k) = A_entries(first_indices(j) + k) - blk_col_start;
+      blk_values(first_ + k) = A_values(first_indices(j) + k);
+    }
+    first_ += nnz;
+  }
+  blk_row_map(blk_nrows) = blk_nnz;  // last element
+}
+
+/**
+ * @brief Extract the diagonal blocks out of a crs matrix.
+ * This is a blocking function that runs on the host.
+ *
+ * @tparam crsMat_t The type of the CRS matrix.
+ * @param A [in] The square CrsMatrix. It is expected that column indices are
+ * in ascending order
+ * @param DiagBlk_v [out] The vector of the extracted CRS diagonal blocks
+ * (1 <= the number of diagonal blocks <= A_nrows)
+ *
+ * Usage Example:
+ *    kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b);
+ */
+template <typename crsMat_t>
+void kk_extract_diagonal_blocks_crsmatrix_sequential(
+    const crsMat_t &A, std::vector<crsMat_t> &DiagBlk_v) {
+  using row_map_type = typename crsMat_t::row_map_type;
+  using entries_type = typename crsMat_t::index_type;
+  using values_type = typename crsMat_t::values_type;
+  using graph_t = typename crsMat_t::StaticCrsGraphType;
+  using out_row_map_type = typename graph_t::row_map_type::non_const_type;
+  using out_entries_type = typename graph_t::entries_type::non_const_type;
+  using out_values_type = typename crsMat_t::values_type::non_const_type;
+  using out_row_map_hostmirror_type = typename out_row_map_type::HostMirror;
+  using out_entries_hostmirror_type = typename out_entries_type::HostMirror;
+  using out_values_hostmirror_type = typename out_values_type::HostMirror;
+
+  using ordinal_type = typename crsMat_t::non_const_ordinal_type;
+  using size_type = typename crsMat_t::non_const_size_type;
+  // NOTE(review): the View template arguments were reconstructed (a rank-1
+  // host view of size_type offsets) -- confirm against the upstream header.
+  using offset_view1d_type =
+      Kokkos::View<size_type *, Kokkos::LayoutLeft, Kokkos::HostSpace>;
+
+  row_map_type A_row_map = A.graph.row_map;
+  entries_type A_entries = A.graph.entries;
+  values_type A_values = A.values;
+
+  auto A_row_map_h =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_row_map);
+  auto A_entries_h =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_entries);
+  auto A_values_h =
+      Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_values);
+
+  ordinal_type A_nrows = static_cast<ordinal_type>(A.numRows());
+  ordinal_type A_ncols = static_cast<ordinal_type>(A.numCols());
+  ordinal_type n_blocks = static_cast<ordinal_type>(DiagBlk_v.size());
+
+  if (A_nrows != A_ncols) {
+    std::ostringstream os;
+    os << "The diagonal block extraction only works with square matrices -- "
+          "matrix A: "
+       << A_nrows << " x " << A_ncols;
+    throw std::runtime_error(os.str());
+  }
+
+  if (n_blocks == 1) {
+    // One block case: simply shallow copy A to DiagBlk_v[0]
+    DiagBlk_v[0] = crsMat_t(A);
+  } else {
+    // n_blocks != 1 (an empty DiagBlk_v also takes this branch)
+    if (A_nrows == 0) {
+      // Degenerate case: A is an empty matrix
+      for (ordinal_type i = 0; i < n_blocks; i++) {
+        DiagBlk_v[i] = crsMat_t();
+      }
+    } else {
+      // A_nrows >= 1
+      if ((n_blocks < 1) || (A_nrows < n_blocks)) {
+        std::ostringstream os;
+        os << "The number of diagonal blocks (" << n_blocks
+           << ") should be >=1 and <= the number of rows of the matrix A ("
+           << A_nrows << ")";
+        throw std::runtime_error(os.str());
+      }
+
+      // Ceiling of A_nrows / n_blocks; trailing blocks may end up with fewer
+      // rows (see the clamp on blk_nrows below).
+      ordinal_type rows_per_block = ((A_nrows % n_blocks) == 0)
+                                        ? (A_nrows / n_blocks)
+                                        : (A_nrows / n_blocks + 1);
+
+      std::vector<out_row_map_type> row_map_v(n_blocks);
+      std::vector<out_entries_type> entries_v(n_blocks);
+      std::vector<out_values_type> values_v(n_blocks);
+      std::vector<out_row_map_hostmirror_type> row_map_h_v(n_blocks);
+      std::vector<out_entries_hostmirror_type> entries_h_v(n_blocks);
+      std::vector<out_values_hostmirror_type> values_h_v(n_blocks);
+
+      ordinal_type blk_row_start = 0;  // first row index of i-th diagonal block
+      ordinal_type blk_col_start = 0;  // first col index of i-th diagonal block
+      ordinal_type blk_nrows, blk_ncols;  // Nrows, Ncols of i-th diagonal block
+
+      for (ordinal_type i = 0; i < n_blocks; i++) {
+        blk_nrows = rows_per_block;
+        if ((blk_row_start + rows_per_block) > A_nrows) {
+          blk_nrows = A_nrows - blk_row_start;
+        }
+        blk_col_start = blk_row_start;
+        blk_ncols = blk_nrows;
+
+        // First round: count i-th non-zeros or size of entries_v[i] and find
+        // the first and last column indices at each row
+        size_type blk_nnz = 0;
+        offset_view1d_type first("first", blk_nrows);  // first position per row
+        offset_view1d_type last("last", blk_nrows);    // last position per row
+
+        kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential(
+            A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows,
+            blk_ncols, blk_nnz, first, last);
+
+        // Second round: extract
+        row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1);
+        entries_v[i] = out_entries_type("entries_v", blk_nnz);
+        values_v[i] = out_values_type("values_v", blk_nnz);
+        row_map_h_v[i] =
+            out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1);
+        entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", blk_nnz);
+        values_h_v[i] = out_values_hostmirror_type("values_h_v", blk_nnz);
+
+        kk_extract_subblock_crsmatrix_sequential(
+            A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, first,
+            last, row_map_h_v[i], entries_h_v[i], values_h_v[i]);
+
+        Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]);
+        Kokkos::deep_copy(entries_v[i], entries_h_v[i]);
+        Kokkos::deep_copy(values_v[i], values_h_v[i]);
+
+        DiagBlk_v[i] = crsMat_t("CrsMatrix", blk_nrows, blk_ncols, blk_nnz,
+                                values_v[i], row_map_v[i], entries_v[i]);
+
+        blk_row_start += blk_nrows;
+      }  // for (ordinal_type i = 0; i < n_blocks; i++)
+    }    // A_nrows >= 1
+  }      // n_blocks > 1
+}
+
 }  // namespace Impl
 
 using Impl::isCrsGraphSorted;
diff --git a/sparse/src/KokkosSparse_Utils_cusparse.hpp b/sparse/src/KokkosSparse_Utils_cusparse.hpp
index 65f6ac9689..55e7144dba 100644
--- a/sparse/src/KokkosSparse_Utils_cusparse.hpp
+++ b/sparse/src/KokkosSparse_Utils_cusparse.hpp
@@ -168,6 +168,23 @@ inline cusparseIndexType_t cusparse_index_type_t_from() {
 }
 #endif
 
+// Set the stream on the given cuSPARSE handle when this object
+// is constructed, and reset to the default stream when this object is
+// destructed.
+struct TemporarySetCusparseStream {
+  TemporarySetCusparseStream(cusparseHandle_t handle_,
+                             const Kokkos::Cuda& exec_)
+      : handle(handle_) {
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(handle, exec_.cuda_stream()));
+  }
+
+  ~TemporarySetCusparseStream() {
+    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(handle, NULL));
+  }
+
+  cusparseHandle_t handle;
+};
+
 }  // namespace Impl
 }  // namespace KokkosSparse
 
diff --git a/sparse/src/KokkosSparse_Utils_mkl.hpp b/sparse/src/KokkosSparse_Utils_mkl.hpp
index 0afa75de0a..7a8dd0cb22 100644
--- a/sparse/src/KokkosSparse_Utils_mkl.hpp
+++ b/sparse/src/KokkosSparse_Utils_mkl.hpp
@@ -230,6 +230,15 @@ inline void MKLSparseMatrix>::export_data(
 }  // namespace Impl
 }  // namespace KokkosSparse
 
+// Utilities for oneMKL SYCL code
+#ifdef KOKKOS_ENABLE_SYCL
+#include "oneapi/mkl/spblas.hpp"
+
+namespace KokkosSparse {
+namespace Impl {}
+}  // namespace KokkosSparse
+#endif  // KOKKOS_ENABLE_SYCL
+
 #endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
 
 #endif  // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp
index e263dfd0fa..cc34e55093 100644
--- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp
+++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp
@@ -18,6 +18,7 @@
#define _KOKKOSKERNELS_SPARSEUTILS_ROCSPARSE_HPP #include +#include #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #include @@ -101,8 +102,9 @@ inline rocsparse_operation mode_kk_to_rocsparse(const char kk_mode[]) { myRocsparseOperation = rocsparse_operation_conjugate_transpose; break; default: { - std::cerr << "Mode " << kk_mode[0] << " invalid for rocSPARSE SpMV.\n"; - throw std::invalid_argument("Invalid mode"); + std::ostringstream out; + out << "Mode " << kk_mode[0] << " invalid for rocSPARSE SpMV.\n"; + throw std::invalid_argument(out.str()); } } return myRocsparseOperation; @@ -178,6 +180,24 @@ struct kokkos_to_rocsparse_type> { #define KOKKOSSPARSE_IMPL_ROCM_VERSION \ ROCM_VERSION_MAJOR * 10000 + ROCM_VERSION_MINOR * 100 + ROCM_VERSION_PATCH +// Set the stream on the given rocSPARSE handle when this object +// is constructed, and reset to the default stream when this object is +// destructed. +struct TemporarySetRocsparseStream { + TemporarySetRocsparseStream(rocsparse_handle handle_, + const Kokkos::HIP& exec_) + : handle(handle_) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_set_stream(handle, exec_.hip_stream())); + } + + ~TemporarySetRocsparseStream() { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(handle, NULL)); + } + + rocsparse_handle handle; +}; + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 9f1b9d8cb1..036fe1b119 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,10 +29,12 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// +/// @tparam ExecutionSpace This kernels execution space type. 
/// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param space The execution space instance this kernel will be run on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -42,9 +44,9 @@ namespace Experimental { /// num_rows submatrix of A is structurally symmetric /// @pre handle->create_gs_handle(...) has been called previously /// -template -void gauss_seidel_symbolic(KernelHandle *handle, +template +void gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -95,13 +97,42 @@ void gauss_seidel_symbolic(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_SYMBOLIC< - const_handle_type, Internal_alno_row_view_t_, - Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(&tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, + ExecutionSpace, const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity +/// pattern only) +/// +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows 
submatrix of A is structurally symmetric +/// @pre handle->create_gs_handle(...) has been called previously +/// +template +void gauss_seidel_symbolic(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_symbolic(my_exec_space, handle, num_rows, num_cols, row_map, + entries, is_graph_symmetric); +} + /// /// @brief Block Gauss-Seidel preconditioner setup (first phase, based on /// sparsity pattern only) @@ -142,12 +173,14 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param space The execution space instance this kernel will be run on. 
/// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -157,11 +190,12 @@ void block_gauss_seidel_symbolic( /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template -void gauss_seidel_numeric(KernelHandle *handle, +void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -225,11 +259,12 @@ void gauss_seidel_numeric(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - const_handle_type, format, Internal_alno_row_view_t_, + ExecutionSpace, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(&tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, const_a_v, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, + const_a_v, is_graph_symmetric); } @@ -251,7 +286,6 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @param row_map The matrix's rowmap /// @param entries The matrix's entries /// @param values The matrix's values -/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// @remark If the inverse diagonal is not already available, it's best to call @@ -263,6 +297,50 @@ template void gauss_seidel_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + 
gauss_seidel_numeric( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + is_graph_symmetric); +} + +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values). This version accepts the matrix's inverse diagonal from the +/// user. +/// +/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided +/// inverse diagonal must share this type. +/// @param space The execution space instance this kernel will be run on. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. 
+template +void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -329,15 +407,57 @@ void gauss_seidel_numeric(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - const_handle_type, format, Internal_alno_row_view_t_, + ExecutionSpace, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(&tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, const_a_v, - const_a_d, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, + const_a_v, const_a_d, is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values). This version accepts the matrix's inverse diagonal from the +/// user. +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided +/// inverse diagonal must share this type. 
+/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. +template +void gauss_seidel_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, + scalar_nnz_view_t_ given_inverse_diagonal, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + given_inverse_diagonal, is_graph_symmetric); +} + /// /// @brief Block Gauss-Seidel preconditioner setup (second phase, based on /// matrix's numeric values) @@ -384,6 +504,7 @@ void block_gauss_seidel_numeric( /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -394,6 +515,8 @@ void block_gauss_seidel_numeric( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param space The execution space instance this kernel will be run +/// on. 
NOTE: Currently only used for GS_DEFAULT. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -410,13 +533,15 @@ void block_gauss_seidel_numeric( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void symmetric_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecutionSpace &space, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -532,13 +657,63 @@ void symmetric_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, true, true); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + true, true); +} + +/// +/// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to +/// system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. 
+/// @param handle handle A KokkosKernelsHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run (forward and backward counts as 1) +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void symmetric_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + symmetric_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// @@ -621,6 +796,8 @@ void symmetric_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param space The execution space instance this kernel will be run +/// on. NOTE: Currently only used for GS_DEFAULT. 
/// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -637,13 +814,15 @@ void symmetric_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void forward_sweep_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecutionSpace &space, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -761,13 +940,62 @@ void forward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, true, false); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + true, false); +} + +/// +/// @brief Apply forward Gauss-Seidel preconditioner to system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. 
+/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void forward_sweep_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + forward_sweep_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// @@ -839,6 +1067,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -849,6 +1078,8 @@ void forward_sweep_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. 
/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param space The execution space instance this kernel will be run +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -865,13 +1096,15 @@ void forward_sweep_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void backward_sweep_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecutionSpace &space, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -989,13 +1222,62 @@ void backward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, false, true); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + false, true); +} + +/// +/// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. 
+/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void backward_sweep_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + backward_sweep_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 412985df72..649229918d 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -84,6 +84,9 @@ class GaussSeidelHandle { nnz_lno_persistent_work_host_view_t; // Host view type 
protected: + HandleExecSpace execution_space; + int num_streams; + GSAlgorithm algorithm_type; nnz_lno_persistent_work_host_view_t color_xadj; @@ -101,7 +104,22 @@ class GaussSeidelHandle { * \brief Default constructor. */ GaussSeidelHandle(GSAlgorithm gs) - : algorithm_type(gs), + : execution_space(HandleExecSpace()), + num_streams(1), + algorithm_type(gs), + color_xadj(), + color_adj(), + numColors(0), + called_symbolic(false), + called_numeric(false), + suggested_vector_size(0), + suggested_team_size(0) {} + + GaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + GSAlgorithm gs) + : execution_space(handle_exec_space), + num_streams(n_streams), + algorithm_type(gs), color_xadj(), color_adj(), numColors(0), @@ -113,6 +131,10 @@ class GaussSeidelHandle { virtual ~GaussSeidelHandle() = default; // getters + int get_num_streams() const { return num_streams; } + + HandleExecSpace get_execution_space() const { return this->execution_space; } + GSAlgorithm get_algorithm_type() const { return this->algorithm_type; } nnz_lno_persistent_work_host_view_t get_color_xadj() const { @@ -126,7 +148,24 @@ class GaussSeidelHandle { bool is_symbolic_called() const { return this->called_symbolic; } bool is_numeric_called() const { return this->called_numeric; } - // setters + template + void set_execution_space(const ExecSpaceIn exec_space_in) { + static bool is_set = false; + if (!is_set) { + static_assert(std::is_same::value, + "The type of exec_space_in should be the same as " + "GaussSeidelHandle::HandleExecSpace"); + this->execution_space = exec_space_in; + } else { + if (exec_space_in != this->execution_space) + throw std::runtime_error( + "Gauss Seidel cannot be called on different execution spaces " + "without multiple handles. 
Please create a new handle via " + "create_gs_handle.\n"); + } + is_set = true; + } + void set_algorithm_type(const GSAlgorithm sgs_algo) { this->algorithm_type = sgs_algo; this->called_symbolic = false; @@ -244,10 +283,10 @@ class PointGaussSeidelHandle /** * \brief Default constructor. */ - PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT, + PointGaussSeidelHandle(GSHandle gs_handle, KokkosGraph::ColoringAlgorithm coloring_algo_ = KokkosGraph::COLORING_DEFAULT) - : GSHandle(gs), + : GSHandle(gs_handle), permuted_xadj(), permuted_adj(), permuted_adj_vals(), @@ -263,9 +302,22 @@ class PointGaussSeidelHandle level_2_mem(0), long_row_threshold(0), coloring_algo(coloring_algo_) { - if (gs == GS_DEFAULT) this->choose_default_algorithm(); + if (gs_handle.get_algorithm_type() == GS_DEFAULT) + this->choose_default_algorithm(); } + PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algo_ = + KokkosGraph::COLORING_DEFAULT) + : PointGaussSeidelHandle(GSHandle(gs), coloring_algo_) {} + + PointGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + GSAlgorithm gs = GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algo_ = + KokkosGraph::COLORING_DEFAULT) + : PointGaussSeidelHandle(GSHandle(handle_exec_space, n_streams, gs), + coloring_algo_) {} + void set_block_size(nnz_lno_t bs) { this->block_size = bs; } nnz_lno_t get_block_size() const { return this->block_size; } @@ -613,8 +665,15 @@ class TwoStageGaussSeidelHandle ExecutionSpace, TemporaryMemorySpace, PersistentMemorySpace>; - TwoStageGaussSeidelHandle() - : GSHandle(GS_TWOSTAGE), + using HandleExecSpace = typename GSHandle::HandleExecSpace; + + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + * @param gs_handle The GaussSeidel handle. 
+ */ + TwoStageGaussSeidelHandle(GSHandle gs_handle) + : GSHandle(gs_handle), nrows(0), nrhs(1), direction(GS_SYMMETRIC), @@ -626,6 +685,23 @@ class TwoStageGaussSeidelHandle inner_omega = one; } + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + */ + TwoStageGaussSeidelHandle() + : TwoStageGaussSeidelHandle(GSHandle(GS_TWOSTAGE)) {} + + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + * @param handle_exec_space The execution space instance + * @param n_streams the number of streams + */ + TwoStageGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams) + : TwoStageGaussSeidelHandle( + GSHandle(handle_exec_space, n_streams, GS_TWOSTAGE)) {} + // Sweep direction void setSweepDirection(GSDirection direction_) { this->direction = direction_; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 1c5216bfe5..ee8139d6ac 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -27,6 +27,7 @@ #ifndef KOKKOSSPARSE_MDF_HPP_ #define KOKKOSSPARSE_MDF_HPP_ +#include #include "KokkosSparse_mdf_handle.hpp" #include "KokkosSparse_mdf_impl.hpp" @@ -35,22 +36,20 @@ namespace Experimental { template void mdf_symbolic(const crs_matrix_type& A, MDF_handle& handle) { - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; - using execution_space = typename crs_matrix_type::execution_space; - using range_policy_type = Kokkos::RangePolicy; + using execution_space = typename crs_matrix_type::execution_space; + using team_range_policy_type = Kokkos::TeamPolicy; // Symbolic phase: // compute transpose of A for easy access to columns of A // allocate temporaries // allocate L and U size_type nnzL = 0, nnzU = 0; - range_policy_type setupPolicy(0, A.numRows()); + team_range_policy_type setupPolicy(A.numRows(), Kokkos::AUTO); 
KokkosSparse::Impl::MDF_count_lower compute_nnzL( A, handle.permutation, handle.permutation_inv); - Kokkos::parallel_reduce(range_policy_type(0, A.numRows()), compute_nnzL, - nnzL); + Kokkos::parallel_reduce(setupPolicy, compute_nnzL, nnzL); nnzU = A.nnz() - nnzL + A.numRows(); handle.allocate_data(nnzL, nnzU); @@ -62,17 +61,40 @@ void mdf_symbolic(const crs_matrix_type& A, MDF_handle& handle) { return; } // mdf_symbolic +template +void mdf_print_joined_view( + const view_t& dev_view, const char* sep, + ordinal_t max_count = Kokkos::ArithTraits::max()) { + const auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), dev_view); + + max_count = max_count > (ordinal_t)host_view.extent(0) + ? (ordinal_t)host_view.extent(0) + : max_count; + for (ordinal_t i = 0; i < max_count; ++i) { + if (i) printf("%s", sep); + printf("%g", static_cast(host_view[i])); + } +} + template void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; + using scalar_mag_type = + typename KokkosSparse::Impl::MDF_types::scalar_mag_type; using values_mag_type = typename KokkosSparse::Impl::MDF_types::values_mag_type; using ordinal_type = typename crs_matrix_type::ordinal_type; using value_mag_type = typename values_mag_type::value_type; + using device_type = typename crs_matrix_type::device_type; using execution_space = typename crs_matrix_type::execution_space; using range_policy_type = Kokkos::RangePolicy; + using team_range_policy_type = Kokkos::TeamPolicy; + + using permutation_set_type = + Kokkos::UnorderedMap; // Numerical phase: // loop over rows @@ -85,60 +107,104 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { KokkosSparse::sort_crs_matrix(At); values_mag_type discarded_fill("discarded fill", A.numRows()); col_ind_type deficiency("deficiency", A.numRows()); - col_ind_type update_list_length("update list length", 1); - typename 
col_ind_type::HostMirror update_list_length_host = - Kokkos::create_mirror_view(update_list_length); + ordinal_type update_list_len = 0; col_ind_type update_list("update list", A.numRows()); col_ind_type factored("factored rows", A.numRows()); Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); + permutation_set_type permutation_set(A.numRows()); - KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( - Atmp, At, 0, handle.permutation, discarded_fill, deficiency, - verbosity_level); - Kokkos::parallel_for("MDF: initial fill computation", - range_policy_type(0, Atmp.numRows()), MDF_df_norm); + KokkosSparse::Impl::MDF_discarded_fill_norm + MDF_df_norm(Atmp, At, 0, handle.permutation, permutation_set, + discarded_fill, deficiency, verbosity_level); + Kokkos::parallel_for( + "MDF: initial fill computation", + team_range_policy_type(Atmp.numRows(), Kokkos::AUTO, Kokkos::AUTO), + MDF_df_norm); for (ordinal_type factorization_step = 0; factorization_step < A.numRows(); ++factorization_step) { if (verbosity_level > 0) { - printf("\n\nFactorization step %d\n\n", + printf("\n\nFactorization step %d\n", static_cast(factorization_step)); } - Kokkos::deep_copy(update_list_length_host, update_list_length); - range_policy_type updatePolicy(0, update_list_length_host(0)); - KokkosSparse::Impl::MDF_selective_discarded_fill_norm - MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, - update_list, discarded_fill, deficiency, - verbosity_level); - Kokkos::parallel_for("MDF: updating fill norms", updatePolicy, - MDF_update_df_norm); + if (update_list_len > 0) { + team_range_policy_type updatePolicy(update_list_len, Kokkos::AUTO, + Kokkos::AUTO); + KokkosSparse::Impl::MDF_discarded_fill_norm + MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, + permutation_set, discarded_fill, deficiency, + verbosity_level, update_list); + Kokkos::parallel_for("MDF: updating fill norms", 
updatePolicy, + MDF_update_df_norm); + } + + if (verbosity_level > 1) { + if constexpr (std::is_arithmetic_v) { + printf(" discarded_fill = {"); + mdf_print_joined_view(discarded_fill, ", "); + printf("}\n"); + } + printf(" deficiency = {"); + mdf_print_joined_view(deficiency, ", "); + printf("}\n"); + } - range_policy_type stepPolicy(factorization_step, Atmp.numRows()); ordinal_type selected_row_idx = 0; - KokkosSparse::Impl::MDF_select_row MDF_row_selector( - factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, - handle.permutation); - Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, - selected_row_idx); - - KokkosSparse::Impl::MDF_compute_list_length - compute_list_length(selected_row_idx, Atmp, At, handle.permutation, - factored, update_list_length, update_list); - Kokkos::parallel_for("MDF: compute update list", range_policy_type(0, 1), - compute_list_length); - - KokkosSparse::Impl::MDF_factorize_row factorize_row( - Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, - handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, discarded_fill, factored, selected_row_idx, - factorization_step, verbosity_level); - Kokkos::parallel_for("MDF: factorize row", range_policy_type(0, 1), - factorize_row); + { + range_policy_type stepPolicy(factorization_step, Atmp.numRows()); + KokkosSparse::Impl::MDF_select_row MDF_row_selector( + factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, + handle.permutation); + Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, + selected_row_idx); + } + + ordinal_type selected_row_len = 0; + { + // vector overloads required for scans to use vector parallel not yet + // provided by kokkos (https://github.com/kokkos/kokkos/issues/6259) + team_range_policy_type updateListPolicy(1, Kokkos::AUTO); + KokkosSparse::Impl::MDF_compute_list_length updateList( + Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, + 
handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, + handle.permutation_inv, permutation_set, discarded_fill, factored, + selected_row_idx, factorization_step, update_list, verbosity_level); + update_list_len = 0; + Kokkos::parallel_reduce("MDF: compute update list", updateListPolicy, + updateList, update_list_len, selected_row_len); + } + if (verbosity_level > 1) { + printf(" updateList = {"); + mdf_print_joined_view(update_list, ", ", update_list_len); + printf("}\n permutation = {"); + mdf_print_joined_view(handle.permutation, ", "); + printf("}\n permutation_inv = {"); + mdf_print_joined_view(handle.permutation_inv, ", "); + printf("}\n"); + } if (verbosity_level > 0) { - printf("\n"); + printf( + " Selected row idx %d with length %d. Requires update of %d fill " + "norms.\n", + static_cast(selected_row_idx), + static_cast(selected_row_len), + static_cast(update_list_len)); + } + + // If this was the last row no need to update A and At! + if (factorization_step < A.numRows() - 1) { + team_range_policy_type factorizePolicy(selected_row_len, Kokkos::AUTO, + Kokkos::AUTO); + KokkosSparse::Impl::MDF_factorize_row factorize_row( + Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, + handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, + handle.permutation_inv, permutation_set, discarded_fill, factored, + selected_row_idx, factorization_step, update_list, verbosity_level); + Kokkos::parallel_for("MDF: factorize row", factorizePolicy, + factorize_row); } } // Loop over factorization steps diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index 03fd660b95..c6005bee12 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -58,7 +58,7 @@ struct MDF_handle { // elimination during the factorization. 
col_ind_type permutation, permutation_inv; - int verbosity; + int verbosity = 0; crs_matrix_type L, U; diff --git a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index 3ffe44ffca..5ea4b3c436 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -78,7 +78,13 @@ class PAR_ILUTHandle { /// iteration to iteration drops below /// this, the algorithm will stop (even if /// max_iters has not been hit) - float_t fill_in_limit; /// The threshold for the ILU factorization + float_t fill_in_limit; /// The threshold for removing candidates + /// from the intermediate L and U is set such + /// that the resulting sparsity pattern has + /// at most `fill_in_limit` times the number + /// of non-zeros of the ILU(0) + /// factorization. This selection is executed + /// separately for both factors L and U. bool async_update; /// Whether compute LU factors should do asychronous /// updates. When ON, the algorithm will usually converge /// faster but it makes the algorithm non-deterministic. 
diff --git a/sparse/src/KokkosSparse_spgemm.hpp b/sparse/src/KokkosSparse_spgemm.hpp index 882dfd5ec2..b2737a9e2c 100644 --- a/sparse/src/KokkosSparse_spgemm.hpp +++ b/sparse/src/KokkosSparse_spgemm.hpp @@ -167,7 +167,7 @@ template void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { auto blockDim = A.blockDim(); - if (blockDim != B.blockDim() or blockDim != C.blockDim()) { + if (blockDim != B.blockDim() || blockDim != C.blockDim()) { throw std::invalid_argument( "Block SpGEMM must be called for matrices with the same block size"); } diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index 1106d300c8..a95c828c96 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -661,7 +661,7 @@ class SPGEMMHandle { #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) { + if (std::is_same::value) { this->algorithm_type = SPGEMM_KK; #ifdef VERBOSE std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK" diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 60fb5331cf..bd038813d1 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -40,43 +40,81 @@ struct RANK_ONE {}; struct RANK_TWO {}; } // namespace -/// \brief Tag-dispatch for \c Kokkos sparse matrix-vector multiply on single -/// vector +/// \brief Kokkos sparse matrix-vector multiply on single +/// vectors (RANK_ONE tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +/// controlled by mode (see below). /// +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. 
\tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-1 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-1 Kokkos::View and its rank must match that of XVector /// -/// \tparam AMatrix A KokkosSparse::CrsMatrix, or KokkosSparse::BsrMatrix -/// +/// \param space [in] The execution space instance on which to run the +/// kernel. /// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] -/// \param alpha [in] Scalar multiplier for the matrix A. -/// \param A [in] The sparse matrix A. -/// \param x [in] A vector. -/// \param beta [in] Scalar multiplier for the multivector y. -/// \param y [in/out] vector. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. /// \param tag RANK_ONE dispatch -/// -#ifdef DOXY // documentation version -template +#ifdef DOXY // documentation version - don't separately document SFINAE + // specializations for BSR and CRS +template #else -template ::value>::type* = nullptr> #endif -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y, [[maybe_unused]] const RANK_ONE& tag) { - // Make sure that x and y have the same rank. + // Make sure that x and y are Views. 
+ static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); + +// Make sure that x and y have the same rank. +// Make sure that x (and therefore y) is rank 1. +#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), + "KokkosSparse::spmv: Vector ranks do not match."); + + static_assert(XVector::rank() == 1, + "KokkosSparse::spmv: Both Vector inputs must have rank 1 " + "in order to call this specialization of spmv."); +#else static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), "KokkosSparse::spmv: Vector ranks do not match."); - // Make sure that x (and therefore y) is rank 1. static_assert(static_cast(XVector::rank) == 1, "KokkosSparse::spmv: Both Vector inputs must have rank 1 " "in order to call this specialization of spmv."); +#endif // Make sure that y is non-const. static_assert(std::is_same::value, @@ -136,45 +174,51 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(y_i, beta, y_i); + KokkosBlas::scal(space, y_i, beta, y_i); return; } // Whether to call KokkosKernel's native implementation, even if a TPL impl is // available bool useFallback = controls.isParameter("algorithm") && - controls.getParameter("algorithm") == "native"; + (controls.getParameter("algorithm") != "tpl"); #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE does not support the conjugate mode (C), and cuSPARSE 9 only - // supports the normal (N) mode. - if (std::is_same::value || - std::is_same::value) { -#if (9000 <= CUDA_VERSION) - useFallback = useFallback || (mode[0] != NoTranspose[0]); -#endif -#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) + // cuSPARSE does not support the conjugate mode (C) + if constexpr (std::is_same_v || + std::is_same_v) { useFallback = useFallback || (mode[0] == Conjugate[0]); -#endif } + // cuSPARSE 12 requires that the output (y) vector is 16-byte aligned for all + // scalar types +#if defined(CUSPARSE_VER_MAJOR) && (CUSPARSE_VER_MAJOR == 12) + uintptr_t yptr = uintptr_t((void*)y.data()); + if (yptr % 16 != 0) useFallback = true; +#endif #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE if (std::is_same::value) { + Kokkos::HIPSpace>::value) { useFallback = useFallback || (mode[0] != NoTranspose[0]); } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same::value) { + if (std::is_same_v) { + useFallback = useFallback || (mode[0] == Conjugate[0]); + } +#ifdef KOKKOS_ENABLE_SYCL + if (std::is_same_v) { useFallback = useFallback || (mode[0] == Conjugate[0]); } +#endif #endif if (useFallback) { @@ -185,62 +229,91 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename AMatrix_Internal::non_const_value_type>::name() + "]"; Kokkos::Profiling::pushRegion(label); - Impl::SPMV::spmv(controls, - mode, 
- alpha, - A_i, x_i, - beta, - y_i); + Impl::SPMV::spmv(space, controls, mode, alpha, A_i, + x_i, beta, y_i); Kokkos::Profiling::popRegion(); } else { // note: the cuSPARSE spmv wrapper defines a profiling region, so one is not // needed here. - Impl::SPMV::spmv(controls, mode, - alpha, A_i, x_i, - beta, y_i); + Impl::SPMV::spmv(space, controls, mode, alpha, A_i, x_i, + beta, y_i); } } -#ifdef DOXY // hide SFINAE +/// \brief Kokkos sparse matrix-vector multiply on single +/// vector (RANK_ONE tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +/// controlled by mode (see below). +/// +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-1 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-1 Kokkos::View and its rank must match that of XVector +/// +/// \param controls [in] kokkos-kernels control structure. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. 
+/// \param tag RANK_ONE dispatch +#ifdef DOXY // documentation version template #else template ::value>::type* = nullptr> + typename std::enable_if< + KokkosSparse::is_crs_matrix::value>::type* = nullptr> #endif void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_ONE) { + const BetaType& beta, const YVector& y, const RANK_ONE& tag) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y, tag); +} + +#ifndef DOXY // hide SFINAE specialization for BSR +template ::value>::type* = nullptr> +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, + [[maybe_unused]] const RANK_ONE& tag) { + // Make sure that x and y are Views. + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that x and y have the same rank. +#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), + "KokkosSparse::spmv: Vector ranks do not match."); +#else static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), "KokkosSparse::spmv: Vector ranks do not match."); +#endif // Make sure that x (and therefore y) is rank 1. 
static_assert(static_cast(XVector::rank) == 1, "KokkosSparse::spmv: Both Vector inputs must have rank 1 " @@ -257,7 +330,8 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename AMatrix::device_type, Kokkos::MemoryTraits, typename AMatrix::size_type> Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); - KokkosSparse::spmv(controls, mode, alpha, Acrs, x, beta, y, RANK_ONE()); + KokkosSparse::spmv(space, controls, mode, alpha, Acrs, x, beta, y, + RANK_ONE()); return; } // Check compatibility of dimensions at run time. @@ -321,9 +395,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(y_i, beta, y_i); + KokkosBlas::scal(space, y_i, beta, y_i); return; } @@ -331,7 +405,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // Whether to call KokkosKernel's native implementation, even if a TPL impl is // available bool useFallback = controls.isParameter("algorithm") && - controls.getParameter("algorithm") == "native"; + (controls.getParameter("algorithm") != "tpl"); #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // cuSPARSE does not support the modes (C), (T), (H) @@ -366,66 +440,58 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename AMatrix_Internal::non_const_value_type>::name() + "]"; Kokkos::Profiling::pushRegion(label); - Experimental::Impl::SPMV_BSRMATRIX< - typename AMatrix_Internal::const_value_type, - typename AMatrix_Internal::const_ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::const_size_type, - typename 
XVector_Internal::const_value_type*, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type*, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits, - false>::spmv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); + Experimental::Impl::SPMV_BSRMATRIX::spmv_bsrmatrix(space, controls, + mode, alpha, A_i, + x_i, beta, y_i); Kokkos::Profiling::popRegion(); } else { -#define __SPMV_TYPES__ \ - typename AMatrix_Internal::const_value_type, \ - typename AMatrix_Internal::const_ordinal_type, \ - typename AMatrix_Internal::device_type, \ - typename AMatrix_Internal::memory_traits, \ - typename AMatrix_Internal::const_size_type, \ - typename XVector_Internal::const_value_type*, \ - typename XVector_Internal::array_layout, \ - typename XVector_Internal::device_type, \ - typename XVector_Internal::memory_traits, \ - typename YVector_Internal::value_type*, \ - typename YVector_Internal::array_layout, \ - typename YVector_Internal::device_type, \ - typename YVector_Internal::memory_traits - constexpr bool tpl_spec_avail = KokkosSparse::Experimental::Impl::spmv_bsrmatrix_tpl_spec_avail< - __SPMV_TYPES__>::value; + ExecutionSpace, AMatrix_Internal, XVector_Internal, + YVector_Internal>::value; constexpr bool eti_spec_avail = tpl_spec_avail ? 
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY /* force FALSE in app/test */ : KokkosSparse::Experimental::Impl::spmv_bsrmatrix_eti_spec_avail< - __SPMV_TYPES__>::value; - - Experimental::Impl::SPMV_BSRMATRIX<__SPMV_TYPES__, tpl_spec_avail, - eti_spec_avail>::spmv_bsrmatrix(controls, - mode, - alpha, - A_i, x_i, - beta, - y_i); + ExecutionSpace, AMatrix_Internal, XVector_Internal, + YVector_Internal>::value; -#undef __SPMV_TYPES__ + Experimental::Impl::SPMV_BSRMATRIX< + ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, + tpl_spec_avail, eti_spec_avail>::spmv_bsrmatrix(space, controls, mode, + alpha, A_i, x_i, beta, + y_i); } } +template ::value>::type* = nullptr> +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE& tag) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y, tag); +} +#endif // ifndef DOXY + +namespace Impl { template struct SPMV2D1D { static bool spmv2d1d(const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y); + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y); }; #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || !defined(KOKKOSKERNELS_ETI_ONLY) @@ -436,10 +502,22 @@ struct SPMV2D1D + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); return true; } +}; + #else + template struct SPMV2D1D + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return 
false; + } }; +#endif #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || !defined(KOKKOSKERNELS_ETI_ONLY) template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); + return true; + } +}; + #else + template struct SPMV2D1D + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } }; +#endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || !defined(KOKKOSKERNELS_ETI_ONLY) template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); return true; } +}; + #else + template struct SPMV2D1D + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } }; +#endif +} // namespace Impl -/// \brief Tag-dispatch sparse matrix-vector multiply on multivectors +template +using SPMV2D1D + [[deprecated("KokkosSparse::SPMV2D1D is not part of the public interface - " + "use KokkosSparse::spmv instead")]] = + Impl::SPMV2D1D; + +/// \brief Kokkos sparse matrix-vector multiply on multivectors +/// (RANK_TWO tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +/// controlled by mode (see below). /// -/// \tparam AMatrix A KokkosSparse::CrsMatrix, -/// KokkosSparse::Experimental::BsrMatrix +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. 
\tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-2 Kokkos::View and its rank must match that of XVector /// +/// \param space [in] The execution space instance on which to run the +/// kernel. /// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] \c "N" for no transpose -/// \param alpha [in] Scalar multiplier for the matrix A. -/// \param A [in] The sparse matrix A. -/// \param x [in] A multivector (rank-2 Kokkos::View). -/// \param beta [in] Scalar multiplier for the multivector y. -/// \param y [in/out] multivector (exrank-2 Kokkos::View). +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. /// \param tag RANK_TWO dispatch -/// -#ifdef DOXY -template +#ifdef DOXY // documentation version +template #else -template ::value>::type* = nullptr> #endif -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y, [[maybe_unused]] const RANK_TWO& tag) { - - // Make sure that x and y have the same rank. + // Make sure that x and y are Views. 
+ static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); +// Make sure that x and y have the same rank. +#if (KOKKOS_VERSION >= 40100) + static_assert(XVector::rank() == YVector::rank(), + "KokkosSparse::spmv: Vector ranks do not match."); +#else static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), "KokkosSparse::spmv: Vector ranks do not match."); +#endif // Make sure that x (and therefore y) is rank 2. 
static_assert(static_cast(XVector::rank) == 2, "KokkosSparse::spmv: Both Vector inputs must have rank 2 " @@ -592,10 +758,11 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], YVector_SubInternal y_i = Kokkos::subview(y, Kokkos::ALL(), 0); // spmv (mode, alpha, A, x_i, beta, y_i); - using impl_type = SPMV2D1D; - if (impl_type::spmv2d1d(mode, alpha, A, x_i, beta, y_i)) { + using impl_type = + Impl::SPMV2D1D; + if (impl_type::spmv2d1d(space, mode, alpha, A, x_i, beta, y_i)) { return; } } @@ -622,58 +789,85 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], useNative = useNative || (Conjugate[0] == mode[0]); #endif useNative = useNative || (controls.isParameter("algorithm") && - (controls.getParameter("algorithm") == "native")); + (controls.getParameter("algorithm") != "tpl")); if (useNative) { return Impl::SPMV_MV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits, + ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, std::is_integral::value, - false>::spmv_mv(controls, mode, alpha, A_i, x_i, beta, y_i); + false>::spmv_mv(space, controls, mode, alpha, A_i, x_i, beta, y_i); } else { - return Impl::SPMV_MV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - 
typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_mv(controls, mode, - alpha, A_i, x_i, - beta, y_i); + return Impl::SPMV_MV::spmv_mv(space, controls, mode, + alpha, A_i, x_i, beta, + y_i); } } } -#ifdef DOXY // hide SFINAE +/// \brief Kokkos sparse matrix-vector multiply on multivectors +/// (RANK_TWO tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +/// controlled by mode (see below). +/// +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank-2 Kokkos::View and its rank must match that of XVector +/// +/// \param controls [in] kokkos-kernels control structure. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. 
+/// \param tag RANK_TWO dispatch +#ifdef DOXY template #else template ::value>::type* = nullptr> + typename std::enable_if< + KokkosSparse::is_crs_matrix::value>::type* = nullptr> #endif void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_TWO) { + const BetaType& beta, const YVector& y, const RANK_TWO& tag) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y, tag); +} + +#ifndef DOXY // hide SFINAE +template ::value>::type* = nullptr> +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, + [[maybe_unused]] const RANK_TWO& tag) { + // Make sure that x and y are Views. + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that x and y have the same rank. 
static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), @@ -694,7 +888,8 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename AMatrix::device_type, Kokkos::MemoryTraits, typename AMatrix::size_type> Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); - KokkosSparse::spmv(controls, mode, alpha, Acrs, x, beta, y, RANK_TWO()); + KokkosSparse::spmv(space, controls, mode, alpha, Acrs, x, beta, y, + RANK_TWO()); return; } // Check compatibility of dimensions at run time. @@ -757,9 +952,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(y_i, beta, y_i); + KokkosBlas::scal(space, y_i, beta, y_i); return; } // @@ -781,15 +976,13 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], XVector_SubInternal x_0 = Kokkos::subview(x_i, Kokkos::ALL(), 0); YVector_SubInternal y_0 = Kokkos::subview(y_i, Kokkos::ALL(), 0); - return spmv(controls, mode, alpha, A_i, x_0, beta, y_0, RANK_ONE()); + return spmv(space, controls, mode, alpha, A_i, x_0, beta, y_0, RANK_ONE()); } // // Whether to call KokkosKernel's native implementation, even if a TPL impl is // available - bool useFallback = - controls.isParameter("algorithm") && - (controls.getParameter("algorithm") == "native" || - controls.getParameter("algorithm") == "experimental_bsr_tc"); + bool useFallback = controls.isParameter("algorithm") && + (controls.getParameter("algorithm") != "tpl"); #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // cuSPARSE does not support the modes (C), (T), (H) @@ -817,45 +1010,34 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], "]"; 
Kokkos::Profiling::pushRegion(label); Experimental::Impl::SPMV_MV_BSRMATRIX< - typename AMatrix_Internal::const_value_type, - typename AMatrix_Internal::const_ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::const_size_type, - typename XVector_Internal::const_value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits, + ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, std::is_integral::value, - false>::spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); + false>::spmv_mv_bsrmatrix(space, controls, mode, alpha, A_i, x_i, beta, + y_i); Kokkos::Profiling::popRegion(); } else { Experimental::Impl::SPMV_MV_BSRMATRIX< - typename AMatrix_Internal::const_value_type, - typename AMatrix_Internal::const_ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::const_size_type, - typename XVector_Internal::const_value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits, + ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, std::is_integral::value>:: - spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); + spmv_mv_bsrmatrix(space, controls, mode, alpha, A_i, x_i, beta, y_i); } } +template ::value>::type* = nullptr> +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& 
beta, const YVector& y, const RANK_TWO& tag) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y, tag); +} +#endif + /// \brief Public interface to local sparse matrix-vector multiply. /// -/// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both +/// Compute y := beta*y + alpha*Op(A)*x, where x and y are either both /// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View /// instances, and Op(A) is determined /// by \c mode. If beta == 0, ignore and overwrite the initial @@ -878,26 +1060,51 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], /// enabled for Kokkos::CrsMatrix and Kokkos::Experimental::BsrMatrix on a /// single vector, or for Kokkos::Experimental::BsrMatrix with a multivector. /// -/// \tparam AMatrix KokkosSparse::CrsMatrix or -/// KokkosSparse::Experimental::BsrMatrix +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank 1 or 2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank 1 or 2 Kokkos::View and its rank must match that of XVector /// +/// \param space [in] The execution space instance on which to run the +/// kernel. /// \param controls [in] kokkos-kernels control structure -/// \param mode [in] "N" for no transpose, "T" for transpose, or "C" -/// for conjugate transpose. -/// \param alpha [in] Scalar multiplier for the matrix A. -/// \param A [in] The sparse matrix A. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. 
\param A [in] The sparse matrix A. /// \param x [in] Either a single vector (rank-1 Kokkos::View) or /// multivector (rank-2 Kokkos::View). /// \param beta [in] Scalar multiplier for the (multi)vector y. /// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or /// multivector (rank-2 Kokkos::View). It must have the same number /// of columns as x. -/// -template -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], +template +void spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y) { + // Make sure that x and y are Views. + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosSparse::spmv: YVector must be a Kokkos::View."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that both x and y have the same rank. static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), @@ -945,22 +1152,71 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(y, beta, y); + KokkosBlas::scal(space, y, beta, y); return; } // using RANK_SPECIALISE = typename std::conditional(XVector::rank) == 2, RANK_TWO, RANK_ONE>::type; - spmv(controls, mode, alpha, A, x, beta, y, RANK_SPECIALISE()); + spmv(space, controls, mode, alpha, A, x, beta, y, RANK_SPECIALISE()); +} + +/// \brief Public interface to local sparse matrix-vector multiply. +/// +/// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both +/// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View +/// instances, and Op(A) is determined +/// by \c mode. If beta == 0, ignore and overwrite the initial +/// entries of y; if alpha == 0, ignore the entries of A and x. +/// +/// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have +/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on +/// Volta or Ampere architectures. On Volta-architecture GPUs the only available +/// precision is mixed-precision fp32 accumulator from fp16 inputs. On +/// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16, +/// x is fp16, and y is fp32. Otherwise, double-precision is used. The caller +/// may override this by setting the \c "tc_precision" = \c "mixed" or +/// \c "double" as desired. +/// +/// For mixed precision, performance will degrade for blockDim < 16. +/// For double precision, for blockDim < 8. +/// For such cases, consider an alternate SpMV algorithm. +/// +/// May have \c "algorithm" set to \c "native" to bypass TPLs if they are +/// enabled for Kokkos::CrsMatrix and Kokkos::Experimental::BsrMatrix on a +/// single vector, or for Kokkos::Experimental::BsrMatrix with a multivector. 
+/// +/// \tparam AMatrix KokkosSparse::CrsMatrix or +/// KokkosSparse::Experimental::BsrMatrix +/// +/// \param controls [in] kokkos-kernels control structure +/// \param mode [in] "N" for no transpose, "T" for transpose, "C" for +/// conjugate, or "H" for conjugate transpose. +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix A. +/// \param x [in] Either a single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). +/// \param beta [in] Scalar multiplier for the (multi)vector y. +/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). It must have the same number +/// of columns as x. +template +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y); } +#ifndef DOXY /// \brief Catch-all public interface to error on invalid Kokkos::Sparse spmv /// argument types /// -/// This is a catch-all interfaceace that throws a compile-time error if \c +/// This is a catch-all interface that throws a compile-time error if \c /// AMatrix is not a CrsMatrix, or BsrMatrix /// template ::value && + !KokkosSparse::is_crs_matrix::value>::type* = nullptr> +void spmv(const ExecutionSpace& /* space */, + KokkosKernels::Experimental::Controls /*controls*/, + const char[] /*mode*/, const AlphaType& /*alpha*/, + const AMatrix& /*A*/, const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + // have to arrange this so that the compiler can't tell this is false until + // instantiation + static_assert(KokkosSparse::is_crs_matrix::value || + KokkosSparse::Experimental::is_bsr_matrix::value, + "SpMV: AMatrix must be CrsMatrix or BsrMatrix"); +} +#endif // ifndef DOXY + +/// \brief Kokkos sparse matrix-vector multiply.
+/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode +/// (see below). +/// +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank 1 or 2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank 1 or 2 Kokkos::View and its rank must match that of XVector +/// +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. template void spmv(const char mode[], const AlphaType& alpha, const AMatrix& A, @@ -989,18 +1284,66 @@ void spmv(const char mode[], const AlphaType& alpha, const AMatrix& A, spmv(controls, mode, alpha, A, x, beta, y); } +/// \brief Kokkos sparse matrix-vector multiply. +/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode +/// (see below). +/// +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a +/// rank 1 or 2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be +/// convertible to YVector::value_type. \tparam YVector Type of y, must be a +/// rank 1 or 2 Kokkos::View and its rank must match that of XVector +/// +/// \param space [in] The execution space instance on which to run the +/// kernel.
+/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha +/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. +template +void spmv(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + KokkosKernels::Experimental::Controls controls; + spmv(space, controls, mode, alpha, A, x, beta, y); +} + namespace Experimental { -template -void spmv_struct(const char mode[], const int stencil_type, +template +void spmv_struct(const ExecutionSpace& space, const char mode[], + const int stencil_type, const Kokkos::View& structure, const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_ONE) { + const BetaType& beta, const YVector& y, + [[maybe_unused]] const RANK_ONE& tag) { // Make sure that both x and y have the same rank. static_assert((int)XVector::rank == (int)YVector::rank, "KokkosSparse::spmv_struct: Vector ranks do not match."); + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: AMatrix must be accessible from " + "ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: XVector must be accessible from " + "ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: YVector must be accessible from " + "ExecutionSpace"); // Make sure that x (and therefore y) is rank 1. 
static_assert( (int)XVector::rank == 1, @@ -1062,24 +1405,23 @@ void spmv_struct(const char mode[], const int stencil_type, YVector_Internal y_i = y; return KokkosSparse::Impl::SPMV_STRUCT< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type*, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type*, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_struct(mode, stencil_type, - structure, alpha, - A_i, x_i, beta, - y_i); + ExecutionSpace, AMatrix_Internal, XVector_Internal, + YVector_Internal>::spmv_struct(space, mode, stencil_type, structure, + alpha, A_i, x_i, beta, y_i); } +template +void spmv_struct(const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE& tag) { + spmv_struct(typename AMatrix::execution_space{}, mode, stencil_type, + structure, alpha, A, x, beta, y, tag); +} + +namespace Impl { template struct SPMV2D1D_STRUCT { @@ -1089,6 +1431,14 @@ struct SPMV2D1D_STRUCT { Kokkos::HostSpace>& structure, const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y); + + template + static bool spmv2d1d_struct( + const ExecutionSpace& space, const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y); }; #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || !defined(KOKKOSKERNELS_ETI_ONLY) @@ -1106,6 +1456,18 @@ struct SPMV2D1D_STRUCT + static bool spmv2d1d_struct( + const ExecutionSpace& 
space, const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv_struct(space, mode, stencil_type, structure, alpha, A, x, beta, y, + RANK_ONE()); + return true; + } }; #else template + static bool spmv2d1d_struct( + const ExecutionSpace& /* space*/, const char /*mode*/[], + const int /*stencil_type*/, + const Kokkos::View& /*structure*/, + const AlphaType& /*alpha*/, const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } }; #endif @@ -1138,6 +1511,18 @@ struct SPMV2D1D_STRUCT + static bool spmv2d1d_struct( + const ExecutionSpace& space, const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv_struct(space, mode, stencil_type, structure, alpha, A, x, beta, y, + RANK_ONE()); + return true; + } }; #else template + static bool spmv2d1d_struct( + const ExecutionSpace /*space*/, const char /*mode*/[], + const int /*stencil_type*/, + const Kokkos::View& /*structure*/, + const AlphaType& /*alpha*/, const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } }; #endif @@ -1170,6 +1566,18 @@ struct SPMV2D1D_STRUCT + static bool spmv2d1d_struct( + const ExecutionSpace& space, const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv_struct(space, mode, stencil_type, structure, alpha, A, x, beta, y, + RANK_ONE()); + return true; + } }; #else template + static bool spmv2d1d_struct( + const ExecutionSpace& /*space*/, const char /*mode*/[], + const int /*stencil_type*/, + const Kokkos::View& /*structure*/, + const AlphaType& /*alpha*/, const AMatrix& /*A*/, const XVector& 
/*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } }; #endif +} // namespace Impl template -void spmv_struct(const char mode[], const int stencil_type, + class YVector, class XLayout = typename XVector::array_layout> +using SPMV2D1D_STRUCT + [[deprecated("KokkosSparse::SPMV2D1D_STRUCT is not part of the public " + "interface - use KokkosSparse::spmv_struct instead")]] = + Impl::SPMV2D1D_STRUCT; + +template +void spmv_struct(const ExecutionSpace& space, const char mode[], + const int stencil_type, const Kokkos::View& structure, const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_TWO) { + const BetaType& beta, const YVector& y, + [[maybe_unused]] const RANK_TWO& tag) { + // Make sure A, x, y are accessible to ExecutionSpace + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: AMatrix must be accessible from " + "ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: XVector must be accessible from " + "ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::spmv_struct: YVector must be accessible from " + "ExecutionSpace"); // Make sure that both x and y have the same rank. 
static_assert(XVector::rank == YVector::rank, "KokkosBlas::spmv: Vector ranks do not match."); @@ -1251,11 +1696,11 @@ void spmv_struct(const char mode[], const int stencil_type, YVector_SubInternal y_i = Kokkos::subview(y, Kokkos::ALL(), 0); // spmv_struct (mode, alpha, A, x_i, beta, y_i); - if (SPMV2D1D_STRUCT:: - spmv2d1d_struct(mode, stencil_type, structure, alpha, A, x_i, beta, - y_i)) { + if (Impl::SPMV2D1D_STRUCT:: + spmv2d1d_struct(space, mode, stencil_type, structure, alpha, A, x_i, + beta, y_i)) { return; } } @@ -1278,24 +1723,24 @@ void spmv_struct(const char mode[], const int stencil_type, YVector_Internal y_i = y; return KokkosSparse::Impl::SPMV_MV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>:: - spmv_mv(KokkosKernels::Experimental::Controls(), mode, alpha, A_i, x_i, - beta, y_i); + ExecutionSpace, AMatrix_Internal, XVector_Internal, + YVector_Internal>::spmv_mv(space, + KokkosKernels::Experimental::Controls(), + mode, alpha, A_i, x_i, beta, y_i); } } +template +void spmv_struct(const char mode[], const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO& tag) { + spmv_struct(typename AMatrix::execution_space{}, mode, stencil_type, + structure, alpha, A, x, beta, y, tag); +} + /// \brief Public interface to structured local sparse matrix-vector multiply. 
/// /// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both @@ -1332,6 +1777,45 @@ void spmv_struct(const char mode[], const int stencil_type, RANK_SPECIALISE()); } +/// \brief Public interface to structured local sparse matrix-vector multiply. +/// +/// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both +/// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View +/// instances, A is a KokkosSparse::CrsMatrix, and Op(A) is determined +/// by \c mode. If beta == 0, ignore and overwrite the initial +/// entries of y; if alpha == 0, ignore the entries of A and x. +/// +/// \param space [in] The execution space instance on which to run the +/// kernel. +/// \param mode [in] "N" for no transpose, "T" for transpose, or "C" +/// for conjugate transpose. +/// \param stencil_type +/// \param structure [in] this 1D view stores the # rows in each dimension +/// (i,j,k) +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix; KokkosSparse::CrsMatrix instance. +/// \param x [in] Either a +/// single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). +/// \param beta [in] Scalar multiplier for the (multi)vector y. +/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). It must have the same number +/// of columns as x. 
+template +void spmv_struct(const ExecutionSpace& space, const char mode[], + const int stencil_type, + const Kokkos::View& structure, + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + typedef + typename std::conditional::type + RANK_SPECIALISE; + spmv_struct(space, mode, stencil_type, structure, alpha, A, x, beta, y, + RANK_SPECIALISE()); +} + } // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_spmv_team.hpp b/sparse/src/KokkosSparse_spmv_team.hpp index fb55a65420..5c9e843669 100644 --- a/sparse/src/KokkosSparse_spmv_team.hpp +++ b/sparse/src/KokkosSparse_spmv_team.hpp @@ -55,18 +55,32 @@ int KOKKOS_INLINE_FUNCTION team_spmv( // Check compatibility of dimensions at run time. if (values.extent(0) != colIndices.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " "values: %d, colIndices: %d", (int)values.extent(0), (int)colIndices.extent(0)); +#else + Kokkos::printf( + "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " + "values: %d, colIndices: %d", + (int)values.extent(0), (int)colIndices.extent(0)); +#endif return 1; } if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); +#else + Kokkos::printf( + "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " + "x: %d, y: %d, row_ptr: %d", + (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); +#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL @@ -109,18 +123,32 @@ int KOKKOS_INLINE_FUNCTION team_vector_spmv( // Check compatibility of dimensions at run time. 
if (values.extent(0) != colIndices.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " "values: %d, colIndices: %d", (int)values.extent(0), (int)colIndices.extent(0)); +#else + Kokkos::printf( + "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " + "values: %d, colIndices: %d", + (int)values.extent(0), (int)colIndices.extent(0)); +#endif return 1; } if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); +#else + Kokkos::printf( + "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " + "x: %d, y: %d, row_ptr: %d", + (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); +#endif return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/sparse/src/KokkosSparse_sptrsv_supernode.hpp b/sparse/src/KokkosSparse_sptrsv_supernode.hpp index 845efabc57..c6e5d406a7 100644 --- a/sparse/src/KokkosSparse_sptrsv_supernode.hpp +++ b/sparse/src/KokkosSparse_sptrsv_supernode.hpp @@ -27,7 +27,7 @@ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) #include "KokkosBlas3_trmm.hpp" -#include "KokkosBlas_trtri.hpp" +#include "KokkosLapack_trtri.hpp" #include "KokkosBatched_Trtri_Decl.hpp" #include "KokkosBatched_Trtri_Serial_Impl.hpp" @@ -1402,8 +1402,8 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, // If we are running KokkosKernels::trmm on device, // then we need to allocate a workspace on device using trmm_execution_space = typename KernelHandle::HandleExecSpace; - using trmm_memory_space = typename trmm_execution_space::memory_space; - using trmm_view_t = Kokkos::View; + using trmm_memory_space = typename KernelHandle::HandlePersistentMemorySpace; + using trmm_view_t = Kokkos::View; 
#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // use KokkosBlas::trmm only with CUBLAS (since deep-copy to host throws an // error) @@ -1472,12 +1472,12 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, // call trtri on device auto dViewLjj = Kokkos::subview(dViewL, range_type(0, nscol), Kokkos::ALL()); - KokkosBlas::trtri(&uplo_char, &diag_char, dViewLjj); + KokkosLapack::trtri(&uplo_char, &diag_char, dViewLjj); } else #endif { // call trtri on host - KokkosBlas::trtri(&uplo_char, &diag_char, Ljj); + KokkosLapack::trtri(&uplo_char, &diag_char, Ljj); } #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE time1 += timer.seconds(); diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 1fcfa7132a..b8c545ffe2 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -141,7 +141,7 @@ SPGEMM_SYMBOLIC_AVAIL_MKL_E(Kokkos::Serial) #ifdef KOKKOS_ENABLE_OPENMP SPGEMM_SYMBOLIC_AVAIL_MKL_E(Kokkos::OpenMP) #endif -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index b9c1f6c1dd..07bb0a0f0a 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -25,8 +25,7 @@ namespace KokkosSparse { namespace Experimental { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_bsrmatrix_tpl_spec_avail { enum : bool { value = false }; }; @@ -41,12 +40,15 @@ struct spmv_bsrmatrix_tpl_spec_avail { SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE) \ template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const 
SCALAR*, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - YL, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ + Kokkos::Cuda, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR*, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ enum : bool { value = true }; \ }; @@ -125,17 +127,22 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const MKL_INT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + EXECSPACE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -159,10 +166,9 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, #endif // Specialization struct which defines whether a specialization exists -template ::type>::value> + std::is_integral_v> struct spmv_mv_bsrmatrix_tpl_spec_avail { enum : bool { value = false }; }; @@ -173,17 +179,21 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail { // These versions of cuSPARSE require the ordinal and offset types to be the // same. 
For KokkosKernels, this means int/int only. // cuSparse level 3 does not currently support LayoutRight -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ - SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, false> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + Kokkos::Cuda, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false> { \ + enum : bool { value = true }; \ }; #if (9000 <= CUDA_VERSION) @@ -221,16 +231,23 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int, const SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, true> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + EXECSPACE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const int, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + Kokkos::View< \ + const 
SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -257,17 +274,20 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, #include "KokkosSparse_Utils_rocsparse.hpp" -#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE( \ - SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ - template <> \ - struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + Kokkos::HIP, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50200 diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index c6136eab3e..97019e4682 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -20,6 +20,7 @@ #include "KokkosKernels_AlwaysFalse.hpp" #include "KokkosKernels_Controls.hpp" #include "KokkosSparse_Utils_mkl.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include @@ -198,149 +199,47 @@ inline void spm_mv_block_impl_mkl( #endif -#if (__INTEL_MKL__ == 2017) - -inline void spmv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, - 
MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, float* y) { - mkl_sbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, - Arowptrs + 1, x, &beta, y); -} - -inline void spmv_block_impl_mkl(char mode, double alpha, double beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const double* Avalues, - const double* x, double* y) { - mkl_dbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, - Arowptrs + 1, x, &beta, y); -} - -inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); - const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex8* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex8* x_mkl = reinterpret_cast(x); - MKL_Complex8* y_mkl = reinterpret_cast(y); - mkl_cbsrmv(&mode, &m, &n, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, - Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); -} - -inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - const MKL_Complex16* alpha_mkl = - reinterpret_cast(&alpha); - const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex16* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex16* x_mkl = reinterpret_cast(x); - MKL_Complex16* y_mkl = reinterpret_cast(y); - mkl_zbsrmv(&mode, &m, &n, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, - Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); -} - -inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, - 
MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, MKL_INT colx, MKL_INT ldx, - float* y, MKL_INT ldy) { - mkl_sbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, - Arowptrs, Arowptrs + 1, x, &beta, y); -} - -inline void spm_mv_block_impl_mkl( - char mode, double alpha, double beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, - const double* x, MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { - mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, - Arowptrs, Arowptrs + 1, x, ldx, &beta, y, ldy); -} - -inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, MKL_INT colx, - MKL_INT ldx, Kokkos::complex* y, - MKL_INT ldy) { - const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); - const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex8* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex8* x_mkl = reinterpret_cast(x); - MKL_Complex8* y_mkl = reinterpret_cast(y); - mkl_cbsrmv(&mode, &m, &n, &colx, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, - Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); -} - -inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - MKL_INT colx, MKL_INT ldx, - Kokkos::complex* y, MKL_INT ldy) { - const MKL_Complex16* alpha_mkl = - reinterpret_cast(&alpha); - const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex16* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex16* x_mkl = reinterpret_cast(x); - MKL_Complex16* y_mkl = reinterpret_cast(y); 
- mkl_zbsrmv(&mode, &m, &n, &colx, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, - Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); -} - -#endif - -#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV_BSRMATRIX< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using AMatrix = \ - BsrMatrix, MKL_INT const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv_bsrmatrix( \ - const KokkosKernels::Experimental::Controls& /*controls*/, \ - const char mode[], const coefficient_type& alpha, const AMatrix& A, \ - const XVector& X, const coefficient_type& beta, const YVector& Y) { \ - std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ - A.numCols(), A.blockDim(), A.graph.row_map.data(), \ - A.graph.entries.data(), A.values.data(), X.data(), \ - Y.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + EXECSPACE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const>, \ + Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using AMatrix = 
::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, device_type, \ + Kokkos::MemoryTraits, MKL_INT const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_bsrmatrix( \ + const EXECSPACE&, \ + const KokkosKernels::Experimental::Controls& /*controls*/, \ + const char mode[], const coefficient_type& alpha, const AMatrix& A, \ + const XVector& X, const coefficient_type& beta, const YVector& Y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ + A.numCols(), A.blockDim(), A.graph.row_map.data(), \ + A.graph.entries.data(), A.values.data(), X.data(), \ + Y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -368,18 +267,23 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #define KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_MV_BSRMATRIX< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const**, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR**, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ + EXECSPACE, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const>, \ + Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ - using AMatrix = \ - BsrMatrix, MKL_INT const>; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ 
+ SCALAR const, MKL_INT const, device_type, \ + Kokkos::MemoryTraits, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -388,6 +292,7 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, using coefficient_type = typename YVector::non_const_value_type; \ \ static void spmv_mv_bsrmatrix( \ + const EXECSPACE&, \ const KokkosKernels::Experimental::Controls& /*controls*/, \ const char mode[], const coefficient_type& alpha, const AMatrix& A, \ const XVector& X, const coefficient_type& beta, const YVector& Y) { \ @@ -453,6 +358,7 @@ namespace Impl { template void spmv_block_impl_cusparse( + const Kokkos::Cuda& exec, const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -463,6 +369,8 @@ void spmv_block_impl_cusparse( /* initialize cusparse library */ cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + /* Set cuSPARSE to use the given stream until this function exits */ + KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -565,6 +473,7 @@ template < typename YVector::array_layout>::value, bool> = true> void spm_mv_block_impl_cusparse( + const Kokkos::Cuda& exec, const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -575,6 +484,8 @@ void spm_mv_block_impl_cusparse( /* initialize cusparse library */ cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + /* Set cuSPARSE to use the given stream until this function exits */ + KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t 
myCusparseOperation; @@ -666,16 +577,21 @@ void spm_mv_block_impl_cusparse( COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + Kokkos::Cuda, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = BsrMatrix; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ using XVector = Kokkos::View< \ SCALAR const*, LAYOUT, device_type, \ Kokkos::MemoryTraits>; \ @@ -685,7 +601,8 @@ void spm_mv_block_impl_cusparse( \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_bsrmatrix(const Controls& controls, const char mode[], \ + static void spmv_bsrmatrix(const Kokkos::Cuda& exec, \ + const Controls& controls, const char mode[], \ const coefficient_type& alpha, \ const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ @@ -693,7 +610,7 @@ void spm_mv_block_impl_cusparse( std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_cusparse(controls, mode, alpha, A, x, beta, y); \ + spmv_block_impl_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -757,16 +674,23 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, ETI_AVAIL) \ template <> \ struct SPMV_MV_BSRMATRIX< \ - SCALAR const, ORDINAL 
const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const**, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR**, Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, false, true, ETI_AVAIL> { \ + Kokkos::Cuda, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true, ETI_AVAIL> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = BsrMatrix; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -776,7 +700,8 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_mv_bsrmatrix(const Controls& controls, const char mode[], \ + static void spmv_mv_bsrmatrix(const Kokkos::Cuda& exec, \ + const Controls& controls, const char mode[], \ const coefficient_type& alpha, \ const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ @@ -784,7 +709,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spm_mv_block_impl_cusparse(controls, mode, alpha, A, x, beta, y); \ + spm_mv_block_impl_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -840,6 +765,7 @@ namespace Impl { template void spmv_block_impl_rocsparse( + const Kokkos::HIP& exec, const KokkosKernels::Experimental::Controls& controls, const char mode[], typename 
YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -893,6 +819,8 @@ void spmv_block_impl_rocsparse( "A entries must be contiguous"); rocsparse_handle handle = controls.getRocsparseHandle(); + // resets handle stream to NULL when out of scope + KokkosSparse::Impl::TemporarySetRocsparseStream tsrs(handle, exec); // set the mode rocsparse_operation trans; @@ -914,7 +842,7 @@ void spmv_block_impl_rocsparse( */ // KokkosSparse Bsr matrix blocks are layoutright (row-major) static_assert( - std::is_same_v, + std::is_same_v, "A blocks must be stored layout-right"); rocsparse_direction dir = rocsparse_direction_row; @@ -1010,16 +938,21 @@ void spmv_block_impl_rocsparse( COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + Kokkos::HIP, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = BsrMatrix; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ using XVector = Kokkos::View< \ SCALAR const*, LAYOUT, device_type, \ Kokkos::MemoryTraits>; \ @@ -1029,7 +962,8 @@ void spmv_block_impl_rocsparse( \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_bsrmatrix(const Controls& controls, const char mode[], \ + static void spmv_bsrmatrix(const Kokkos::HIP& exec, \ + const 
Controls& controls, const char mode[], \ const coefficient_type& alpha, \ const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ @@ -1037,7 +971,7 @@ void spmv_block_impl_rocsparse( std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_rocsparse(controls, mode, alpha, A, x, beta, y); \ + spmv_block_impl_rocsparse(exec, controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp index 529abc82b7..5e33df1fa3 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -21,25 +21,27 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template ::type>::value> + std::is_integral_v> struct spmv_mv_tpl_spec_avail { enum : bool { value = false }; }; -#define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \ - XL, YL, MEMSPACE) \ - template <> \ - struct spmv_mv_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR**, YL, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \ + XL, YL, MEMSPACE) \ + template <> \ + struct spmv_mv_tpl_spec_avail< \ + Kokkos::Cuda, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR**, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; /* CUSPARSE_VERSION 10300 and lower seem to have a bug in cusparseSpMM diff --git 
a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 717c62b985..30e0b6e243 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ #define KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ +#include + #include "KokkosKernels_Controls.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -89,7 +91,8 @@ cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { } template -void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, +void spmv_mv_cusparse(const Kokkos::Cuda &exec, + const KokkosKernels::Experimental::Controls &controls, const char mode[], typename YVector::non_const_value_type const &alpha, const AMatrix &A, const XVector &x, @@ -108,6 +111,8 @@ void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, /* initialize cusparse library */ cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + /* Set cuSPARSE to use the given stream until this function exits */ + TemporarySetCusparseStream(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t opA; @@ -116,8 +121,9 @@ void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, case 'T': opA = CUSPARSE_OPERATION_TRANSPOSE; break; case 'H': opA = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; break; default: { - std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV MV.\n"; - throw std::invalid_argument("Invalid mode"); + std::ostringstream out; + out << "Mode " << mode << " invalid for cuSPARSE SpMV MV.\n"; + throw std::invalid_argument(out.str()); } } @@ -191,39 +197,43 @@ void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse)); } -#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ - COMPILE_LIBRARY) \ - template <> \ - struct SPMV_MV< \ - 
SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const **, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR **, YL, Kokkos::Device, \ - Kokkos::MemoryTraits, false, true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = CrsMatrix; \ - using XVector = Kokkos::View< \ - SCALAR const **, XL, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - using Controls = KokkosKernels::Experimental::Controls; \ - static void spmv_mv(const Controls &controls, const char mode[], \ - const coefficient_type &alpha, const AMatrix &A, \ - const XVector &x, const coefficient_type &beta, \ - const YVector &y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_mv_cusparse(controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_MV< \ + Kokkos::Cuda, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const **, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const **, XL, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + using Controls = KokkosKernels::Experimental::Controls; \ + static void spmv_mv(const Kokkos::Cuda &exec, const Controls &controls, 
\ + const char mode[], const coefficient_type &alpha, \ + const AMatrix &A, const XVector &x, \ + const coefficient_type &beta, const YVector &y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mv_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; /* cusparseSpMM with following restrictions diff --git a/sparse/tpls/KokkosSparse_spmv_struct_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_struct_tpl_spec_avail.hpp index 707e653803..7b853c953c 100644 --- a/sparse/tpls/KokkosSparse_spmv_struct_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_struct_tpl_spec_avail.hpp @@ -20,15 +20,13 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_struct_tpl_spec_avail { enum : bool { value = false }; }; // Specialization struct which defines whether a specialization exists -template +template struct spmv_mv_struct_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 060fef45bb..653ec94811 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -24,8 +24,7 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_tpl_spec_avail { enum : bool { value = false }; }; @@ -40,12 +39,15 @@ struct spmv_tpl_spec_avail { YL, MEMSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - YL, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ + Kokkos::Cuda, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const ORDINAL, 
Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET>, \ + Kokkos::View< \ + const SCALAR*, XL, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ enum : bool { value = true }; \ }; @@ -181,22 +183,22 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ - template <> \ - struct spmv_tpl_spec_avail< \ - const SCALAR, const rocsparse_int, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const rocsparse_int, \ - const SCALAR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ + template <> \ + struct spmv_tpl_spec_avail< \ + Kokkos::HIP, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const rocsparse_int>, \ + Kokkos::View< \ + const SCALAR*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(double, Kokkos::LayoutLeft) @@ -215,17 +217,22 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_tpl_spec_avail< \ - const SCALAR, const MKL_INT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits > { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + EXECSPACE, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + 
Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -242,6 +249,49 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + Kokkos::Experimental::SYCL, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const ORDINAL, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const ORDINAL>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + float, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + double, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + Kokkos::complex, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + Kokkos::complex, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace) + +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + float, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + double, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + Kokkos::complex, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( + Kokkos::complex, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace) +#endif + #endif // KOKKOSKERNELS_ENABLE_TPL_MKL } // namespace Impl diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp 
b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index ecbe45c7fd..efb591375b 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ #define KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ +#include + #include "KokkosKernels_Controls.hpp" // cuSPARSE @@ -28,7 +30,8 @@ namespace KokkosSparse { namespace Impl { template -void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, +void spmv_cusparse(const Kokkos::Cuda& exec, + const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, @@ -39,6 +42,8 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, /* initialize cusparse library */ cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + /* Set cuSPARSE to use the given stream until this function exits */ + TemporarySetCusparseStream(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -49,10 +54,16 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, myCusparseOperation = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; break; default: { - std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV.\n"; - throw std::invalid_argument("Invalid mode"); + std::ostringstream out; + out << "Mode " << mode << " invalid for cuSPARSE SpMV.\n"; + throw std::invalid_argument(out.str()); } } + // cuSPARSE doesn't directly support mode H with real values, but this is + // equivalent to mode T + if (myCusparseOperation == CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE && + !Kokkos::ArithTraits::isComplex) + myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE; #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) @@ -193,39 +204,43 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, #endif // CUDA_VERSION } -#define 
KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const, SCALAR const*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = CrsMatrix; \ - using XVector = Kokkos::View< \ - SCALAR const*, LAYOUT, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv(const Controls& controls, const char mode[], \ - const coefficient_type& alpha, const AMatrix& A, \ - const XVector& x, const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_cusparse(controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::Cuda, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + using coefficient_type = typename 
YVector::non_const_value_type; \ + \ + static void spmv(const Kokkos::Cuda& exec, const Controls& controls, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; // BMK: cuSPARSE that comes with CUDA 9 does not support tranpose or conjugate @@ -350,7 +365,8 @@ namespace KokkosSparse { namespace Impl { template -void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, +void spmv_rocsparse(const Kokkos::HIP& exec, + const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, @@ -362,6 +378,8 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, /* initialize rocsparse library */ rocsparse_handle handle = controls.getRocsparseHandle(); + /* Set rocsparse to use the given stream until this function exits */ + TemporarySetRocsparseStream(handle, exec); /* Set the operation mode */ rocsparse_operation myRocsparseOperation = mode_kk_to_rocsparse(mode); @@ -450,21 +468,21 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, #define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT, COMPILE_LIBRARY) \ template <> \ - struct SPMV, \ - Kokkos::MemoryTraits, rocsparse_int const, \ - SCALAR const*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, \ - SCALAR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, true, \ - COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ + struct SPMV< \ + Kokkos::HIP, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + rocsparse_int const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, \ + Kokkos::Device, \ + 
Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using AMatrix = CrsMatrix; \ @@ -477,14 +495,14 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv(const Controls& controls, const char mode[], \ - const coefficient_type& alpha, const AMatrix& A, \ - const XVector& x, const coefficient_type& beta, \ - const YVector& y) { \ + static void spmv(const Kokkos::HIP& exec, const Controls& controls, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_rocsparse(controls, mode, alpha, A, x, beta, y); \ + spmv_rocsparse(exec, controls, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -600,113 +618,23 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, beta_mkl, reinterpret_cast(y))); } +// Note: classic MKL runs on Serial/OpenMP but can't use our execution space +// instances #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ - struct SPMV< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using AMatrix = \ - CrsMatrix, MKL_INT const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename YVector::non_const_value_type; \ 
- using Controls = KokkosKernels::Experimental::Controls; \ - \ - static void spmv(const Controls&, const char mode[], \ - const coefficient_type& alpha, const AMatrix& A, \ - const XVector& x, const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_MKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ - A.graph.row_map.data(), A.graph.entries.data(), \ - A.values.data(), x.data(), y.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ - }; -#endif - -#if (__INTEL_MKL__ == 2017) -// MKL 2017: use old interface: mkl_?csrmv -inline char mode_kk_to_mkl(char mode_kk) { - switch (toupper(mode_kk)) { - case 'N': return 'N'; - case 'T': return 'T'; - case 'H': return 'C'; - default:; - } - throw std::invalid_argument( - "Invalid mode for MKL (should be one of N, T, H)"); -} - -inline void spmv_mkl(char mode, float alpha, float beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const float* Avalues, const float* x, float* y) { - mkl_scsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, - Arowptrs + 1, x, &beta, y); -} - -inline void spmv_mkl(char mode, double alpha, double beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const double* Avalues, const double* x, double* y) { - mkl_dcsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, - Arowptrs + 1, x, &beta, y); -} - -inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); - const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex8* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex8* x_mkl = 
reinterpret_cast(x); - MKL_Complex8* y_mkl = reinterpret_cast(y); - mkl_ccsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, - Arowptrs + 1, x_mkl, beta_mkl, y_mkl); -} - -inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - const MKL_Complex16* alpha_mkl = - reinterpret_cast(&alpha); - const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); - const MKL_Complex16* Avalues_mkl = - reinterpret_cast(Avalues); - const MKL_Complex16* x_mkl = reinterpret_cast(x); - MKL_Complex16* y_mkl = reinterpret_cast(y); - mkl_zcsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, - Arowptrs + 1, x_mkl, beta_mkl, y_mkl); -} - -#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - Kokkos::LayoutLeft, Kokkos::Device, \ - Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + struct SPMV, \ + Kokkos::MemoryTraits, MKL_INT const>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ CrsMatrix alpha, using coefficient_type = typename YVector::non_const_value_type; \ using Controls = KokkosKernels::Experimental::Controls; \ \ - static void spmv(const Controls&, const char mode[], \ + static void spmv(const EXECSPACE&, const Controls&, const char mode[], \ const coefficient_type& alpha, const AMatrix& A, \ const XVector& x, const coefficient_type& beta, \ const YVector& y) { \ @@ -732,7 +660,6 @@ inline void spmv_mkl(char mode, Kokkos::complex alpha, Kokkos::Profiling::popRegion(); \ } \ }; -#endif #ifdef 
KOKKOS_ENABLE_SERIAL KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -755,6 +682,165 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #endif #undef KOKKOSSPARSE_SPMV_MKL +#endif + +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { + switch (toupper(mode_kk)) { + case 'N': return oneapi::mkl::transpose::nontrans; + case 'T': return oneapi::mkl::transpose::trans; + case 'H': return oneapi::mkl::transpose::conjtrans; + default:; + } + throw std::invalid_argument( + "Invalid mode for oneMKL (should be one of N, T, H)"); +} + +template +struct spmv_onemkl_wrapper {}; + +template <> +struct spmv_onemkl_wrapper { + template + static void spmv(const execution_space& exec, oneapi::mkl::transpose mkl_mode, + typename matrix_type::non_const_value_type const alpha, + const matrix_type& A, const xview_type& x, + typename matrix_type::non_const_value_type const beta, + const yview_type& y) { + using scalar_type = typename matrix_type::non_const_value_type; + using ordinal_type = typename matrix_type::non_const_ordinal_type; + + // oneAPI doesn't directly support mode H with real values, but this is + // equivalent to mode T + if (mkl_mode == oneapi::mkl::transpose::conjtrans && + !Kokkos::ArithTraits::isComplex) + mkl_mode = oneapi::mkl::transpose::trans; + + oneapi::mkl::sparse::matrix_handle_t handle = nullptr; + oneapi::mkl::sparse::init_matrix_handle(&handle); + auto ev_set = oneapi::mkl::sparse::set_csr_data( + exec.sycl_queue(), handle, A.numRows(), A.numCols(), + oneapi::mkl::index_base::zero, + const_cast(A.graph.row_map.data()), + const_cast(A.graph.entries.data()), + const_cast(A.values.data())); + auto ev_opt = oneapi::mkl::sparse::optimize_gemv( + exec.sycl_queue(), mkl_mode, handle, {ev_set}); + auto ev_gemv = + oneapi::mkl::sparse::gemv(exec.sycl_queue(), mkl_mode, alpha, handle, + x.data(), beta, y.data(), 
{ev_opt}); + auto ev_release = oneapi::mkl::sparse::release_matrix_handle( + exec.sycl_queue(), &handle, {ev_gemv}); + ev_release.wait(); + } +}; + +template <> +struct spmv_onemkl_wrapper { + template + static void spmv(const execution_space& exec, oneapi::mkl::transpose mkl_mode, + typename matrix_type::non_const_value_type const alpha, + const matrix_type& A, const xview_type& x, + typename matrix_type::non_const_value_type const beta, + const yview_type& y) { + using scalar_type = typename matrix_type::non_const_value_type; + using ordinal_type = typename matrix_type::non_const_ordinal_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; + + oneapi::mkl::sparse::matrix_handle_t handle = nullptr; + oneapi::mkl::sparse::init_matrix_handle(&handle); + auto ev_set = oneapi::mkl::sparse::set_csr_data( + exec.sycl_queue(), handle, static_cast(A.numRows()), + static_cast(A.numCols()), oneapi::mkl::index_base::zero, + const_cast(A.graph.row_map.data()), + const_cast(A.graph.entries.data()), + reinterpret_cast*>( + const_cast(A.values.data()))); + auto ev_opt = oneapi::mkl::sparse::optimize_gemv( + exec.sycl_queue(), mkl_mode, handle, {ev_set}); + auto ev_gemv = oneapi::mkl::sparse::gemv( + exec.sycl_queue(), mkl_mode, alpha, handle, + reinterpret_cast*>( + const_cast(x.data())), + beta, reinterpret_cast*>(y.data()), {ev_opt}); + auto ev_release = oneapi::mkl::sparse::release_matrix_handle( + exec.sycl_queue(), &handle, {ev_gemv}); + ev_release.wait(); + } +}; + +#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::Experimental::SYCL, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, ORDINAL const>, \ + Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using execution_space = Kokkos::Experimental::SYCL; \ + 
using device_type = Kokkos::Device; \ + using AMatrix = \ + CrsMatrix, ORDINAL const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv(const execution_space& exec, const Controls&, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + oneapi::mkl::transpose mkl_mode = mode_kk_to_onemkl(mode[0]); \ + spmv_onemkl_wrapper::is_complex>::spmv( \ + exec, mkl_mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSSPARSE_SPMV_ONEMKL(float, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(double, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_ONEMKL(float, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(double, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, + Kokkos::Experimental::SYCLDeviceUSMSpace, + 
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#endif } // namespace Impl } // namespace KokkosSparse #endif diff --git a/sparse/unit_test/CMakeLists.txt b/sparse/unit_test/CMakeLists.txt index 745df8992f..d591944675 100644 --- a/sparse/unit_test/CMakeLists.txt +++ b/sparse/unit_test/CMakeLists.txt @@ -24,6 +24,14 @@ IF (KOKKOS_ENABLE_CUDA) backends/Test_Cuda_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_cuda + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Cuda_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_HIP) @@ -34,6 +42,14 @@ IF (KOKKOS_ENABLE_HIP) backends/Test_HIP_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_hip + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_HIP_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_SYCL) @@ -44,6 +60,14 @@ IF (KOKKOS_ENABLE_SYCL) backends/Test_SYCL_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_sycl + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_SYCL_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_OPENMPTARGET) @@ -54,6 +78,14 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) # backends/Test_OpenMPTarget_Sparse.cpp # COMPONENTS sparse # ) + + # KOKKOSKERNELS_ADD_UNIT_TEST( + # blocksparse_openmptarget + # SOURCES + # ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + # backends/Test_OpenMPTarget_BlockSparse.cpp + # COMPONENTS sparse + # ) ENDIF () @@ -71,6 +103,14 @@ IF (KOKKOS_ENABLE_SERIAL) backends/Test_Serial_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_serial + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Serial_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_OPENMP) @@ -81,6 +121,14 @@ IF (KOKKOS_ENABLE_OPENMP) backends/Test_OpenMP_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_openmp + SOURCES + 
${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_OpenMP_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () IF (KOKKOS_ENABLE_THREADS) @@ -91,5 +139,13 @@ IF (KOKKOS_ENABLE_THREADS) backends/Test_Threads_Sparse.cpp COMPONENTS sparse ) + + KOKKOSKERNELS_ADD_UNIT_TEST( + blocksparse_threads + SOURCES + ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp + backends/Test_Threads_BlockSparse.cpp + COMPONENTS sparse + ) ENDIF () diff --git a/sparse/unit_test/Test_BlockSparse.hpp b/sparse/unit_test/Test_BlockSparse.hpp new file mode 100644 index 0000000000..b0dd87c5ed --- /dev/null +++ b/sparse/unit_test/Test_BlockSparse.hpp @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_BLOCKSPARSE_HPP +#define TEST_BLOCKSPARSE_HPP + +#include "Test_Sparse_block_gauss_seidel.hpp" +#include "Test_Sparse_BsrMatrix.hpp" +#include "Test_Sparse_bspgemm.hpp" +#include "Test_Sparse_spmv_bsr.hpp" + +#endif // TEST_BLOCKSPARSE_HPP diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index e0d0085be1..8ae06b598a 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -20,23 +20,20 @@ #include "Test_Sparse_coo2crs.hpp" #endif // KOKKOS_VERSION >= 40099 #include "Test_Sparse_crs2coo.hpp" -#include "Test_Sparse_block_gauss_seidel.hpp" #include "Test_Sparse_Controls.hpp" #include "Test_Sparse_CrsMatrix.hpp" -#include "Test_Sparse_BsrMatrix.hpp" #include "Test_Sparse_mdf.hpp" #include "Test_Sparse_findRelOffset.hpp" #include "Test_Sparse_gauss_seidel.hpp" +#include "Test_Sparse_MergeMatrix.hpp" #include "Test_Sparse_replaceSumInto.hpp" #include "Test_Sparse_replaceSumIntoLonger.hpp" #include "Test_Sparse_spadd.hpp" #include "Test_Sparse_spgemm_jacobi.hpp" #include "Test_Sparse_spgemm.hpp" -#include "Test_Sparse_bspgemm.hpp" #include "Test_Sparse_SortCrs.hpp" #include "Test_Sparse_spiluk.hpp" #include "Test_Sparse_spmv.hpp" -#include "Test_Sparse_spmv_bsr.hpp" #include "Test_Sparse_sptrsv.hpp" #include "Test_Sparse_trsv.hpp" #include "Test_Sparse_par_ilut.hpp" @@ -46,13 +43,13 @@ #include "Test_Sparse_ccs2crs.hpp" #include "Test_Sparse_crs2ccs.hpp" #include "Test_Sparse_removeCrsMatrixZeros.hpp" +#include "Test_Sparse_extractCrsDiagonalBlocks.hpp" // TPL specific tests, these require // particular pairs of backend and TPL // to actually define tests. 
#include "Test_Sparse_Utils_cusparse.hpp" - #include "Test_Sparse_rocsparse.hpp" #endif // TEST_SPARSE_HPP diff --git a/sparse/unit_test/Test_Sparse_Controls.hpp b/sparse/unit_test/Test_Sparse_Controls.hpp index 7da8e19e97..79679f8173 100644 --- a/sparse/unit_test/Test_Sparse_Controls.hpp +++ b/sparse/unit_test/Test_Sparse_Controls.hpp @@ -38,7 +38,25 @@ void test_controls_set() { EXPECT_EQ(c.getParameter("", "default"), "default"); } +void test_controls_il() { + { + KokkosKernels::Experimental::Controls c({{"key1", "val1"}}); + EXPECT_EQ(c.isParameter("blah"), false); + EXPECT_EQ(c.getParameter("blah"), ""); + EXPECT_EQ(c.getParameter("key1"), "val1"); + } + { + KokkosKernels::Experimental::Controls c( + {{"key1", "val1"}, {"key2", "val2"}}); + EXPECT_EQ(c.isParameter("blah"), false); + EXPECT_EQ(c.getParameter("blah"), ""); + EXPECT_EQ(c.getParameter("key1"), "val1"); + EXPECT_EQ(c.getParameter("key2"), "val2"); + } +} + TEST_F(TestCategory, controls_empty) { test_controls_empty(); } TEST_F(TestCategory, controls_set) { test_controls_set(); } +TEST_F(TestCategory, controls_il) { test_controls_il(); } #endif // TEST_SPARSE_CONTROLS_HPP diff --git a/sparse/unit_test/Test_Sparse_MergeMatrix.hpp b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp new file mode 100644 index 0000000000..85c35c0044 --- /dev/null +++ b/sparse/unit_test/Test_Sparse_MergeMatrix.hpp @@ -0,0 +1,590 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef TEST_COMMON_MERGE_MATRIX_HPP +#define TEST_COMMON_MERGE_MATRIX_HPP + +#include +#include +#include +#include + +#include +#include + +#include "KokkosKernels_Iota.hpp" +#include "KokkosSparse_merge_matrix.hpp" + +namespace Test_Sparse_MergeMatrix { + +template +View from_std_vec(const std::string &label, + const std::vector &vec) { + Kokkos::View + uvec(vec.data(), vec.size()); + View result(label, uvec.size()); + Kokkos::deep_copy(result, uvec); + return result; +} + +template +struct CopyMmdToView { + CopyMmdToView(const View &dst, const MMD &src) : dst_(dst), src_(src) {} + + KOKKOS_INLINE_FUNCTION + void operator()(size_t i) const { dst_(i) = src_(i); } + + private: + View dst_; + MMD src_; +}; + +template +void expect_mmd_entries( + const MMD &mmd, + const std::vector &expected) { + using execution_space = typename MMD::execution_space; + using Policy = Kokkos::RangePolicy; + using View = + Kokkos::View; + + // size is as expected + EXPECT_EQ(mmd.size(), expected.size()); + + // values are as expected + View view("mmd-values", mmd.size()); + execution_space space; + Kokkos::parallel_for(Policy(space, 0, mmd.size()), CopyMmdToView(view, mmd)); + auto host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); + space.fence(); + for (size_t i = 0; i < host.size(); ++i) { + EXPECT_EQ(host(i), expected[i]); + } +} + +/*! \brief merge-matrix of two empty views + + Matrix is 0x0. + Only diagonal 0 exists, and it should be size 0. +*/ +template +void view_view_empty_empty() { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + + AView a("view-view-empty-empty-a", 0); + BView b("view-view-empty-empty-b", 0); + expect_mmd_entries(MMD(a, b, 0), {}); +} + +/*! \brief merge-matrix of one empty view + + Matrix is Nx0. 
+ N diagonals exist, all length 0 +*/ +template +void view_view_full_empty() { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + + size_t aNonzero = 5; + AView a("view-view-full-empty-a", aNonzero); + BView b("view-view-full-empty-b", 0); + + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + expect_mmd_entries(MMD(a, b, diagonal), {}); + } +} + +/*! \brief merge-matrix of one empty view + + Matrix is 0xN. + N diagonals exist, all length 0 +*/ +template +void view_view_empty_full() { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + + AView a("view-view-empty-full-a", 0); + BView b = from_std_vec("view-view-empty-full-b", {0, 1, 2, 3}); + + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + expect_mmd_entries(MMD(a, b, diagonal), {}); + } +} + +template +std::tuple view_view_case_all_zero() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + AView a = from_std_vec("view-view-case-all-zero-a", {0, 0, 0, 0}); + BView b = from_std_vec("view-view-case-all-zero-b", {0, 1, 2, 3}); + + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_all_one() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 0 0 0 + // A ------- + // 1 | 1 1 1 1 + // 2 | 1 1 1 1 + // 3 | 1 1 1 1 + // 4 | 1 1 1 1 + AView a = from_std_vec("view-view-case-all-one-a", {1, 2, 3, 4}); + BView b = from_std_vec("view-view-case-all-one-b", {0, 0, 0, 0}); + + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_1() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 1 | 1 0 0 0 + // 2 | 1 1 0 0 + // 3 | 1 1 1 0 + // 4 | 1 1 1 1 + AView a = from_std_vec("view-view-case-1-a", {1, 2, 3, 4}); + BView b = from_std_vec("view-view-case-1-b", {0, 1, 2, 3}); + + // diagonal 0: {} + // 1: {1} + // 2: {1,0} + 
// 3: {1,1,0} + // 4: {1,1,0,0} + // 5: {1,1,0} + // 6: {1,0} + // 7: {1} + + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_2() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 2 2 8 8 8 + // A ----------- + // 1 | 1 0 0 0 0 0 + // 2 | 1 0 0 0 0 0 + // 9 | 1 1 1 1 1 1 + AView a = from_std_vec("view-view-case-2-a", {1, 2, 9}); + BView b = from_std_vec("view-view-case-2-b", {0, 2, 2, 8, 8, 8}); + // 0: {} + // 1: {1} + // 2: {1,0} + // 3: {1,0,0} + // 4: {1,0,0} + // 5: {1,0,0} + // 6: {1,0,0} + // 7: {1,0} + // 8: {1} + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_3() { + using AEntry = typename AView::non_const_value_type; + // M[i,j] = 1 iff A[i] > B[j] + // B 0 2 7 + // A ----- + // -1 | 0 0 0 + // 9 | 1 1 1 + // 9 | 1 1 1 + AView a = from_std_vec("view-view-case-3-a", + {AEntry(-1), AEntry(9), AEntry(9)}); + BView b = from_std_vec("view-view-case-3-b", {0, 2, 7}); + // 0: {} + // 1: {0} + // 2: {1,0} + // 3: {1,1,0} + // 4: {1,1} + // 5: {1} + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_4() { + using BEntry = typename BView::non_const_value_type; + + // M[i,j] = 1 iff A[i] > B[j] + // B -3 -1 7 + // A ------- + // 1 | 1 1 0 + // 6 | 1 1 0 + // 6 | 1 1 0 + AView a = from_std_vec("view-view-case-4-a", {1, 6, 6}); + BView b = + from_std_vec("view-view-case-4-b", {BEntry(-3), BEntry(-1), 7}); + // 0: {} + // 1: {1} + // 2: {1,1} + // 3: {1,1,0} + // 4: {1,0} + // 5: {0} + return std::make_tuple(a, b); +} + +template +std::tuple view_view_case_5() { + using AEntry = typename AView::non_const_value_type; + using BEntry = typename BView::non_const_value_type; + + // M[i,j] = 1 iff A[i] > B[j] + // B -2 0 1 + // A ------- + // -3 | 0 0 0 + // -2 | 0 0 0 + // 2 | 1 1 1 + AView a = from_std_vec("view-view-case-5-a", + {AEntry{-3}, AEntry{-2}, AEntry{2}}); + BView b = from_std_vec("view-view-case-5-b", + {BEntry{-2}, BEntry{0}, BEntry{1}}); + // 0: {} + // 1: {0} + // 2: {0,0} + // 3: {1,0,0} + // 
4: {1,0} + // 5: {1} + return std::make_tuple(a, b); +} + +/*! \brief merge-matrix of two views + + Matrix is MxN. + M+N-1 diagonals exist. +*/ +template +void view_view_full_full() { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + using mmd_value_type = typename MMD::non_const_value_type; + + { + auto [a, b] = view_view_case_all_zero(); + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + MMD mmd(a, b, diagonal); + // every matrix entry on this diagonal is 0 + expect_mmd_entries( + mmd, std::vector(mmd.size(), mmd_value_type(0))); + } + } + { + auto [a, b] = view_view_case_all_one(); + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + MMD mmd(a, b, diagonal); + // every matrix entry on this diagonal is 1 + expect_mmd_entries( + mmd, std::vector(mmd.size(), mmd_value_type(1))); + } + } + { + auto [a, b] = view_view_case_1(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {1}); + expect_mmd_entries(MMD(a, b, 2), {1, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 1, 0, 0}); + expect_mmd_entries(MMD(a, b, 5), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 6), {1, 0}); + expect_mmd_entries(MMD(a, b, 7), {1}); + } + { + auto [a, b] = view_view_case_2(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {1}); + expect_mmd_entries(MMD(a, b, 2), {1, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 5), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 6), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 7), {1, 0}); + expect_mmd_entries(MMD(a, b, 8), {1}); + } + if constexpr (std::is_signed_v) { + auto [a, b] = view_view_case_3(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {0}); + expect_mmd_entries(MMD(a, b, 2), {1, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 1, 
0}); + expect_mmd_entries(MMD(a, b, 4), {1, 1}); + expect_mmd_entries(MMD(a, b, 5), {1}); + } + if constexpr (std::is_signed_v) { + auto [a, b] = view_view_case_4(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {1}); + expect_mmd_entries(MMD(a, b, 2), {1, 1}); + expect_mmd_entries(MMD(a, b, 3), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 0}); + expect_mmd_entries(MMD(a, b, 5), {0}); + } + if constexpr (std::is_signed_v && std::is_signed_v) { + auto [a, b] = view_view_case_5(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {0}); + expect_mmd_entries(MMD(a, b, 2), {0, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 0, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 0}); + expect_mmd_entries(MMD(a, b, 5), {1}); + } +} + +template +void test_view_view() { + view_view_empty_empty(); + view_view_full_empty(); + view_view_empty_full(); + view_view_full_full(); +} + +/*! \brief merge-matrix of an empty view and empty iota + + Matrix is 0x0. + Only diagonal 0 exists, and it should be size 0. +*/ +template +void view_iota_empty_empty() { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + + AView a("view-iota-empty-empty-a", 0); + BView b(0); + EXPECT_EQ(MMD(a, b, 0).size(), 0); +} + +/*! \brief merge-matrix of a full view and empty iota + + Matrix is Nx0. + N diagonals exist, all length 0 +*/ +template +void view_iota_full_empty() { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + + size_t aNonzero = 5; + AView a("view-iota-full-empty-a", aNonzero); + BView b(0); + + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + EXPECT_EQ(MMD(a, b, diagonal).size(), 0); + } +} + +/*! \brief merge-matrix of an empty view and a full iota + + Matrix is 0xN. 
+ N diagonals exist, all length 0 +*/ +template +void view_iota_empty_full() { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + + AView a("view-iota-empty-full-a", 0); + BView b(4); + + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + EXPECT_EQ(MMD(a, b, diagonal).size(), 0); + } +} + +template +std::tuple view_iota_case_all_zero() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + // 0 | 0 0 0 0 + AView a = from_std_vec("view-iota-case-all-zero-a", {0, 0, 0, 0}); + BView b(4); + + return std::make_tuple(a, b); +} + +template +std::tuple view_iota_case_all_one() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 5 | 1 1 1 1 + // 6 | 1 1 1 1 + // 7 | 1 1 1 1 + // 8 | 1 1 1 1 + AView a = from_std_vec("view-iota-case-all-one-a", {5, 6, 7, 8}); + BView b(4); + + return std::make_tuple(a, b); +} + +template +std::tuple view_iota_case_1() { + // M[i,j] = 1 iff A[i] > B[j] + // B 0 1 2 3 + // A ------- + // 1 | 1 0 0 0 + // 2 | 1 1 0 0 + // 3 | 1 1 1 0 + // 4 | 1 1 1 1 + AView a = from_std_vec("view-iota-case-1-a", {1, 2, 3, 4}); + BView b(4); + + // diagonal 0: {} + // 1: {1} + // 2: {1,0} + // 3: {1,1,0} + // 4: {1,1,0,0} + // 5: {1,1,0} + // 6: {1,0} + // 7: {1} + + return std::make_tuple(a, b); +} + +/*! \brief merge-matrix of a full view with a full iota + + Matrix is MxN. + M+N-1 diagonals exist. 
+*/ +template +void view_iota_full_full() { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + using mmd_value_type = typename MMD::non_const_value_type; + + { + auto [a, b] = view_iota_case_all_zero(); + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + MMD mmd(a, b, diagonal); + // every matrix entry on this diagonal is 0 + expect_mmd_entries( + mmd, std::vector(mmd.size(), mmd_value_type(0))); + } + } + { + auto [a, b] = view_iota_case_all_one(); + for (size_t diagonal = 0; diagonal < a.size() + b.size() - 1; ++diagonal) { + MMD mmd(a, b, diagonal); + // every matrix entry on this diagonal is 1 + expect_mmd_entries( + mmd, std::vector(mmd.size(), mmd_value_type(1))); + } + } + { + auto [a, b] = view_iota_case_1(); + expect_mmd_entries(MMD(a, b, 0), {}); + expect_mmd_entries(MMD(a, b, 1), {1}); + expect_mmd_entries(MMD(a, b, 2), {1, 0}); + expect_mmd_entries(MMD(a, b, 3), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 4), {1, 1, 0, 0}); + expect_mmd_entries(MMD(a, b, 5), {1, 1, 0}); + expect_mmd_entries(MMD(a, b, 6), {1, 0}); + expect_mmd_entries(MMD(a, b, 7), {1}); + } +} + +template +void test_view_iota() { + view_iota_empty_empty(); + view_iota_full_empty(); + view_iota_empty_full(); + view_iota_full_full(); +} + +template +void test_rank() { + { + using AView = Kokkos::View; + using BView = Kokkos::View; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + static_assert(MMD::rank == 1, + "MergeMatrixDiagonal should look like a rank-1 view"); + } + + { + using AView = Kokkos::View; + using BView = KokkosKernels::Impl::Iota; + using MMD = KokkosSparse::Impl::MergeMatrixDiagonal; + static_assert(MMD::rank == 1, + "MergeMatrixDiagonal should look like a rank-1 view"); + } +} + +template +void test_merge_matrix() { + test_rank(); + test_view_view(); + test_view_iota(); +} + +} // namespace Test_Sparse_MergeMatrix + +TEST_F(TestCategory, common_merge_matrix) { 
+ // clang-format off + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + // test some select integer / float combos + Test_Sparse_MergeMatrix::test_merge_matrix(); + Test_Sparse_MergeMatrix::test_merge_matrix(); + + // no generally safe way to compare all possible values of these types + // Test_Sparse_MergeMatrix::test_merge_matrix(); + // Test_Sparse_MergeMatrix::test_merge_matrix(); + // Test_Sparse_MergeMatrix::test_merge_matrix(); + + // clang-format on +} + +#endif // TEST_COMMON_MERGE_MATRIX_HPP diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 63c977ca9a..c06509b3ec 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -41,15 +41,14 @@ enum : int { }; } -template +template void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues, bool doStructInterface, int howExecSpecified) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using exec_space = 
typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix; // Create a random matrix on device @@ -160,14 +159,13 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, } } -template +template void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { // This test is about bug #960. - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix, @@ -207,14 +205,13 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { } } -template +template void testSortAndMerge(bool justGraph, int howExecSpecified, - bool doStructInterface, int testCase) { - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; + bool doStructInterface, bool inPlace, int testCase) { + using size_type = default_size_type; + using lno_t = default_lno_t; + using scalar_t = default_scalar; + using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix; using graph_t = typename crsMat_t::staticcrsgraph_type; @@ -361,21 +358,49 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, } else { rowmap_t devOutRowmap; entries_t devOutEntries; + if (inPlace) { + // Start out with the output views containing the input, so that + // sort/merge is done in-place + devOutRowmap = rowmap_t("devOutRowmap", input.graph.row_map.extent(0)); + devOutEntries = + entries_t("devOutEntries", input.graph.entries.extent(0)); + Kokkos::deep_copy(devOutRowmap, input.graph.row_map); + Kokkos::deep_copy(devOutEntries, input.graph.entries); + } switch (howExecSpecified) 
{ - case SortCrsTest::Instance: - KokkosSparse::sort_and_merge_graph(exec_space(), input.graph.row_map, - input.graph.entries, devOutRowmap, - devOutEntries); + case SortCrsTest::Instance: { + if (inPlace) { + KokkosSparse::sort_and_merge_graph(exec_space(), devOutRowmap, + devOutEntries, devOutRowmap, + devOutEntries); + } else { + KokkosSparse::sort_and_merge_graph( + exec_space(), input.graph.row_map, input.graph.entries, + devOutRowmap, devOutEntries); + } break; - case SortCrsTest::ExplicitType: - KokkosSparse::sort_and_merge_graph( - input.graph.row_map, input.graph.entries, devOutRowmap, - devOutEntries); + } + case SortCrsTest::ExplicitType: { + if (inPlace) { + KokkosSparse::sort_and_merge_graph( + devOutRowmap, devOutEntries, devOutRowmap, devOutEntries); + } else { + KokkosSparse::sort_and_merge_graph( + input.graph.row_map, input.graph.entries, devOutRowmap, + devOutEntries); + } break; - case SortCrsTest::ImplicitType: - KokkosSparse::sort_and_merge_graph(input.graph.row_map, - input.graph.entries, devOutRowmap, - devOutEntries); + } + case SortCrsTest::ImplicitType: { + if (inPlace) { + KokkosSparse::sort_and_merge_graph(devOutRowmap, devOutEntries, + devOutRowmap, devOutEntries); + } else { + KokkosSparse::sort_and_merge_graph(input.graph.row_map, + input.graph.entries, + devOutRowmap, devOutEntries); + } + } } outputGraph = graph_t(devOutEntries, devOutRowmap); } @@ -397,21 +422,53 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, rowmap_t devOutRowmap; entries_t devOutEntries; values_t devOutValues; + if (inPlace) { + // Start out with the output views containing the input, so that + // sort/merge is done in-place + devOutRowmap = rowmap_t("devOutRowmap", input.graph.row_map.extent(0)); + devOutEntries = + entries_t("devOutEntries", input.graph.entries.extent(0)); + devOutValues = values_t("devOutValues", input.values.extent(0)); + Kokkos::deep_copy(devOutRowmap, input.graph.row_map); + Kokkos::deep_copy(devOutEntries, 
input.graph.entries); + Kokkos::deep_copy(devOutValues, input.values); + } switch (howExecSpecified) { - case SortCrsTest::Instance: - KokkosSparse::sort_and_merge_matrix( - exec_space(), input.graph.row_map, input.graph.entries, - input.values, devOutRowmap, devOutEntries, devOutValues); + case SortCrsTest::Instance: { + if (inPlace) { + KokkosSparse::sort_and_merge_matrix( + exec_space(), devOutRowmap, devOutEntries, devOutValues, + devOutRowmap, devOutEntries, devOutValues); + } else { + KokkosSparse::sort_and_merge_matrix( + exec_space(), input.graph.row_map, input.graph.entries, + input.values, devOutRowmap, devOutEntries, devOutValues); + } break; - case SortCrsTest::ExplicitType: - KokkosSparse::sort_and_merge_matrix( - input.graph.row_map, input.graph.entries, input.values, - devOutRowmap, devOutEntries, devOutValues); + } + case SortCrsTest::ExplicitType: { + if (inPlace) { + KokkosSparse::sort_and_merge_matrix( + devOutRowmap, devOutEntries, devOutValues, devOutRowmap, + devOutEntries, devOutValues); + } else { + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); + } break; - case SortCrsTest::ImplicitType: - KokkosSparse::sort_and_merge_matrix( - input.graph.row_map, input.graph.entries, input.values, - devOutRowmap, devOutEntries, devOutValues); + } + case SortCrsTest::ImplicitType: { + if (inPlace) { + KokkosSparse::sort_and_merge_matrix(devOutRowmap, devOutEntries, + devOutValues, devOutRowmap, + devOutEntries, devOutValues); + } else { + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); + } + } } // and then construct output from views output = crsMat_t("Output", nrows, ncols, devOutValues.extent(0), @@ -449,14 +506,14 @@ TEST_F(TestCategory, common_sort_crsgraph) { // because the exec space type is determined from the graph. 
if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) continue; - testSortCRS(10, 10, 20, false, doStructInterface, - howExecSpecified); - testSortCRS(100, 100, 2000, false, doStructInterface, - howExecSpecified); - testSortCRS(1000, 1000, 30000, false, doStructInterface, - howExecSpecified); + testSortCRS(10, 10, 20, false, doStructInterface, + howExecSpecified); + testSortCRS(100, 100, 2000, false, doStructInterface, + howExecSpecified); + testSortCRS(1000, 1000, 30000, false, doStructInterface, + howExecSpecified); } - testSortCRSUnmanaged(false, doStructInterface); + testSortCRSUnmanaged(false, doStructInterface); } } @@ -468,24 +525,24 @@ TEST_F(TestCategory, common_sort_crsmatrix) { // because the exec space type is determined from the matrix. if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) continue; - testSortCRS(10, 10, 20, true, doStructInterface, - howExecSpecified); - testSortCRS(100, 100, 2000, true, doStructInterface, - howExecSpecified); - testSortCRS(1000, 1000, 30000, true, doStructInterface, - howExecSpecified); + testSortCRS(10, 10, 20, true, doStructInterface, + howExecSpecified); + testSortCRS(100, 100, 2000, true, doStructInterface, + howExecSpecified); + testSortCRS(1000, 1000, 30000, true, doStructInterface, + howExecSpecified); } - testSortCRSUnmanaged(true, doStructInterface); + testSortCRSUnmanaged(true, doStructInterface); } } TEST_F(TestCategory, common_sort_crs_longrows) { // Matrix/graph with one very long row // Just test this once with graph, and once with matrix - testSortCRS(1, 50000, 10000, false, false, - SortCrsTest::ImplicitType); - testSortCRS(1, 50000, 10000, true, false, - SortCrsTest::ImplicitType); + testSortCRS(1, 50000, 10000, false, false, + SortCrsTest::ImplicitType); + testSortCRS(1, 50000, 10000, true, false, + SortCrsTest::ImplicitType); } TEST_F(TestCategory, common_sort_merge_crsmatrix) { @@ -493,10 +550,14 @@ TEST_F(TestCategory, common_sort_merge_crsmatrix) { for (int 
doStructInterface = 0; doStructInterface < 2; doStructInterface++) { for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { - if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) - continue; - testSortAndMerge(false, howExecSpecified, - doStructInterface, testCase); + for (int inPlace = 0; inPlace < 2; inPlace++) { + if (doStructInterface && + howExecSpecified == SortCrsTest::ExplicitType) + continue; + if (doStructInterface && inPlace) continue; + testSortAndMerge(false, howExecSpecified, + doStructInterface, inPlace, testCase); + } } } } @@ -507,10 +568,14 @@ TEST_F(TestCategory, common_sort_merge_crsgraph) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { - if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) - continue; - testSortAndMerge(true, howExecSpecified, - doStructInterface, testCase); + for (int inPlace = 0; inPlace < 2; inPlace++) { + if (doStructInterface && + howExecSpecified == SortCrsTest::ExplicitType) + continue; + if (doStructInterface && inPlace) continue; + testSortAndMerge(true, howExecSpecified, + doStructInterface, inPlace, testCase); + } } } } diff --git a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp index 279f4f89f9..029ddd14b0 100644 --- a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp +++ b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp @@ -75,12 +75,12 @@ void doAllCsMat(size_t m, size_t n) { // Test randomly generated Cs matrices TEST_F(TestCategory, sparse_randcsmat) { // Square cases - for (int dim = 1; dim < 1024; dim *= 4) doAllCsMat(dim, dim); + for (int dim = 1; dim < 1024; dim *= 4) doAllCsMat(dim, dim); // Non-square cases for (int dim = 1; dim < 1024; dim *= 4) { - doAllCsMat(dim * 3, dim); - doAllCsMat(dim, dim * 3); + doAllCsMat(dim * 3, dim); + doAllCsMat(dim, dim * 3); } } } // namespace 
Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_Transpose.hpp b/sparse/unit_test/Test_Sparse_Transpose.hpp index 0b9ba1a611..05773b6b75 100644 --- a/sparse/unit_test/Test_Sparse_Transpose.hpp +++ b/sparse/unit_test/Test_Sparse_Transpose.hpp @@ -40,14 +40,13 @@ struct ExactCompare { V v2; }; -template +template void testTranspose(int numRows, int numCols, bool doValues) { + using exec_space = typename device_t::execution_space; using range_pol = Kokkos::RangePolicy; using scalar_t = default_scalar; using lno_t = default_lno_t; using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; using crsMat_t = typename KokkosSparse::CrsMatrix; using c_rowmap_t = typename crsMat_t::row_map_type; @@ -158,13 +157,11 @@ void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) { EXPECT_EQ(size_type(0), valuesDiffs); } -template +template void testTransposeBsrRef() { using scalar_t = default_scalar; using lno_t = default_lno_t; using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; using bsrMat_t = typename KokkosSparse::Experimental::BsrMatrix; @@ -236,13 +233,12 @@ void testTransposeBsrRef() { CompareBsrMatrices(At, At_ref); } -template +template void testTransposeBsr(int numRows, int numCols, int blockSize) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using exec_space = typename device_t::execution_space; using bsrMat_t = typename KokkosSparse::Experimental::BsrMatrix; @@ -298,32 +294,32 @@ void testTransposeBsr(int numRows, int numCols, int blockSize) { TEST_F(TestCategory, sparse_transpose_matrix) { // Test both matrix and graph transpose with various sizes - 
testTranspose(100, 100, true); - testTranspose(500, 50, true); - testTranspose(50, 500, true); - testTranspose(4000, 2000, true); - testTranspose(2000, 4000, true); - testTranspose(2000, 2000, true); + testTranspose(100, 100, true); + testTranspose(500, 50, true); + testTranspose(50, 500, true); + testTranspose(4000, 2000, true); + testTranspose(2000, 4000, true); + testTranspose(2000, 2000, true); } TEST_F(TestCategory, sparse_transpose_graph) { - testTranspose(100, 100, false); - testTranspose(500, 50, false); - testTranspose(50, 500, false); - testTranspose(4000, 2000, false); - testTranspose(2000, 4000, false); - testTranspose(2000, 2000, false); + testTranspose(100, 100, false); + testTranspose(500, 50, false); + testTranspose(50, 500, false); + testTranspose(4000, 2000, false); + testTranspose(2000, 4000, false); + testTranspose(2000, 2000, false); } TEST_F(TestCategory, sparse_transpose_bsr_matrix) { - testTransposeBsrRef(); + testTransposeBsrRef(); // Test bsrMatrix transpose with various sizes - testTransposeBsr(100, 100, 3); - testTransposeBsr(500, 50, 5); - testTransposeBsr(50, 500, 16); - testTransposeBsr(4000, 2000, 3); - testTransposeBsr(2000, 4000, 3); - testTransposeBsr(2000, 2000, 5); + testTransposeBsr(100, 100, 3); + testTransposeBsr(500, 50, 5); + testTransposeBsr(50, 500, 16); + testTransposeBsr(4000, 2000, 3); + testTransposeBsr(2000, 4000, 3); + testTransposeBsr(2000, 2000, 5); } #endif diff --git a/sparse/unit_test/Test_Sparse_bspgemm.hpp b/sparse/unit_test/Test_Sparse_bspgemm.hpp index 58a2a18b8a..d3c3a6134f 100644 --- a/sparse/unit_test/Test_Sparse_bspgemm.hpp +++ b/sparse/unit_test/Test_Sparse_bspgemm.hpp @@ -159,6 +159,15 @@ void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, return; } #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUSPARSE_VERSION < 11600) + { + std::cerr + << "TEST SKIPPED: See " + "https://github.com/kokkos/kokkos-kernels/issues/1965 for details." 
+ << std::endl; + return; + } +#endif using namespace Test; // device::execution_space::initialize(); // device::execution_space::print_configuration(std::cout); diff --git a/sparse/unit_test/Test_Sparse_ccs2crs.hpp b/sparse/unit_test/Test_Sparse_ccs2crs.hpp index 56972b8a07..f7e2797759 100644 --- a/sparse/unit_test/Test_Sparse_ccs2crs.hpp +++ b/sparse/unit_test/Test_Sparse_ccs2crs.hpp @@ -136,19 +136,19 @@ TEST_F(TestCategory, sparse_ccs2crs) { std::srand(ticks); // Empty cases - doCcs2Crs(1, 0, 1, 10); - doCcs2Crs(0, 1, 1, 10); + doCcs2Crs(1, 0, 1, 10); + doCcs2Crs(0, 1, 1, 10); - doCcs2Crs(1, 0, 1, 10); - doCcs2Crs(0, 1, 1, 10); + doCcs2Crs(1, 0, 1, 10); + doCcs2Crs(0, 1, 1, 10); - doCcs2Crs(0, 0, 1, 10); - doCcs2Crs(0, 0, 1, 10); + doCcs2Crs(0, 0, 1, 10); + doCcs2Crs(0, 0, 1, 10); // Square cases for (size_t i = 4; i < 1024; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCcs2crs(dim, dim); + doAllCcs2crs(dim, dim); } // Non-square cases @@ -156,16 +156,16 @@ TEST_F(TestCategory, sparse_ccs2crs) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCcs2crs(m, n); + doAllCcs2crs(m, n); } // Fully sparse cases - doCcs2Crs(5, 5, 1, 10, true); - doCcs2Crs(50, 10, 10, 100, true); + doCcs2Crs(5, 5, 1, 10, true); + doCcs2Crs(50, 10, 10, 100, true); // Test the convenience wrapper that accepts a ccs matrix - RandCsMatrix csMat(2, 2, 10, 10, - false); + RandCsMatrix csMat(2, 2, 10, 10, + false); auto ccsMatrix = crs2ccs(csMat.get_dim1(), csMat.get_dim2(), csMat.get_nnz(), csMat.get_vals(), csMat.get_map(), csMat.get_ids()); auto crsMatrix = ccs2crs(ccsMatrix); diff --git a/sparse/unit_test/Test_Sparse_coo2crs.hpp b/sparse/unit_test/Test_Sparse_coo2crs.hpp index 8a52a39220..3427ec44cd 100644 --- a/sparse/unit_test/Test_Sparse_coo2crs.hpp +++ b/sparse/unit_test/Test_Sparse_coo2crs.hpp @@ -197,10 +197,10 @@ void check_crs_matrix(CrsType crsMat, RowType row, ColType col, DataType data, } } 
-template +template void doCoo2Crs(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { - RandCooMat cooMat(m, n, m * n, min_val, - max_val); + RandCooMat cooMat(m, n, m * n, min_val, + max_val); auto randRow = cooMat.get_row(); auto randCol = cooMat.get_col(); auto randData = cooMat.get_data(); @@ -242,12 +242,12 @@ TEST_F(TestCategory, sparse_coo2crs) { UINT32_MAX; std::srand(ticks); - doAllCoo2Crs(0, 0); + doAllCoo2Crs(0, 0); // Square cases for (size_t i = 1; i < 256; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCoo2Crs(dim, dim); + doAllCoo2Crs(dim, dim); } // Non-square cases @@ -255,11 +255,11 @@ TEST_F(TestCategory, sparse_coo2crs) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCoo2Crs(m, n); + doAllCoo2Crs(m, n); } - RandCooMat cooMat(2, 2, 2 * 2, 10, - 10); + RandCooMat cooMat(2, 2, 2 * 2, 10, + 10); auto crsMatrix = KokkosSparse::coo2crs(2, 2, cooMat.get_row(), cooMat.get_col(), cooMat.get_data()); auto cooMatrix = KokkosSparse::crs2coo(crsMatrix); @@ -276,15 +276,15 @@ TEST_F(TestCategory, sparse_coo2crs_staticMatrix_edgeCases) { float staticData[16]{7.28411, 8.17991, 8.84304, 5.01788, 9.85646, 5.79404, 8.42014, 1.90238, 8.24195, 4.39955, 3.2637, 5.4546, 6.51895, 8.09302, 9.36294, 3.44206}; - Kokkos::View row("coo row", 16); - Kokkos::View col("coo col", 16); - Kokkos::View data("coo data", 16); + Kokkos::View row("coo row", 16); + Kokkos::View col("coo col", 16); + Kokkos::View data("coo data", 16); - typename Kokkos::View::HostMirror row_h = + typename Kokkos::View::HostMirror row_h = Kokkos::create_mirror_view(row); - typename Kokkos::View::HostMirror col_h = + typename Kokkos::View::HostMirror col_h = Kokkos::create_mirror_view(col); - typename Kokkos::View::HostMirror data_h = + typename Kokkos::View::HostMirror data_h = Kokkos::create_mirror_view(data); for (int i = 0; i < 16; i++) { row_h(i) = staticRow[i]; @@ -329,4 +329,4 @@ TEST_F(TestCategory, 
sparse_coo2crs_staticMatrix_edgeCases) { auto crsMatFsTs1 = KokkosSparse::coo2crs(m, n, row, col, data); check_crs_matrix(crsMatFsTs1, row_h, col_h, data); } -} // namespace Test \ No newline at end of file +} // namespace Test diff --git a/sparse/unit_test/Test_Sparse_crs2ccs.hpp b/sparse/unit_test/Test_Sparse_crs2ccs.hpp index 720c6cd05e..46cc2fb361 100644 --- a/sparse/unit_test/Test_Sparse_crs2ccs.hpp +++ b/sparse/unit_test/Test_Sparse_crs2ccs.hpp @@ -134,19 +134,19 @@ TEST_F(TestCategory, sparse_crs2ccs) { std::srand(ticks); // Empty cases - doCrs2Ccs(1, 0, 1, 10); - doCrs2Ccs(0, 1, 1, 10); + doCrs2Ccs(1, 0, 1, 10); + doCrs2Ccs(0, 1, 1, 10); - doCrs2Ccs(1, 0, 1, 10); - doCrs2Ccs(0, 1, 1, 10); + doCrs2Ccs(1, 0, 1, 10); + doCrs2Ccs(0, 1, 1, 10); - doCrs2Ccs(0, 0, 1, 10); - doCrs2Ccs(0, 0, 1, 10); + doCrs2Ccs(0, 0, 1, 10); + doCrs2Ccs(0, 0, 1, 10); // Square cases for (size_t i = 4; i < 1024; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCrs2Ccs(dim, dim); + doAllCrs2Ccs(dim, dim); } // Non-square cases @@ -154,16 +154,16 @@ TEST_F(TestCategory, sparse_crs2ccs) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCrs2Ccs(m, n); + doAllCrs2Ccs(m, n); } // Fully sparse cases - doCrs2Ccs(5, 5, 1, 10, true); - doCrs2Ccs(50, 10, 10, 100, true); + doCrs2Ccs(5, 5, 1, 10, true); + doCrs2Ccs(50, 10, 10, 100, true); // Test the convenience wrapper that accepts a crs matrix - RandCsMatrix csMat(2, 2, 10, 10, - false); + RandCsMatrix csMat(2, 2, 10, 10, + false); auto crsMatrix = ccs2crs(csMat.get_dim2(), csMat.get_dim1(), csMat.get_nnz(), csMat.get_vals(), csMat.get_map(), csMat.get_ids()); auto ccsMatrix = crs2ccs(crsMatrix); diff --git a/sparse/unit_test/Test_Sparse_crs2coo.hpp b/sparse/unit_test/Test_Sparse_crs2coo.hpp index 13ff60b0c8..9f81e20f90 100644 --- a/sparse/unit_test/Test_Sparse_crs2coo.hpp +++ b/sparse/unit_test/Test_Sparse_crs2coo.hpp @@ -128,7 +128,7 @@ TEST_F(TestCategory, 
sparse_crs2coo) { // Square cases for (size_t i = 1; i < 256; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCrs2Coo(dim, dim); + doAllCrs2Coo(dim, dim); } // Non-square cases @@ -136,7 +136,7 @@ TEST_F(TestCategory, sparse_crs2coo) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCrs2Coo(m, n); + doAllCrs2Coo(m, n); } } } // namespace Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_csc2csr.hpp b/sparse/unit_test/Test_Sparse_csc2csr.hpp index 61857a3e4f..aa838a4428 100644 --- a/sparse/unit_test/Test_Sparse_csc2csr.hpp +++ b/sparse/unit_test/Test_Sparse_csc2csr.hpp @@ -124,19 +124,19 @@ TEST_F(TestCategory, sparse_csc2csr) { std::srand(ticks); // Empty cases - doCsc2Csr(1, 0, 1, 10); - doCsc2Csr(0, 1, 1, 10); + doCsc2Csr(1, 0, 1, 10); + doCsc2Csr(0, 1, 1, 10); - doCsc2Csr(1, 0, 1, 10); - doCsc2Csr(0, 1, 1, 10); + doCsc2Csr(1, 0, 1, 10); + doCsc2Csr(0, 1, 1, 10); - doCsc2Csr(0, 0, 1, 10); - doCsc2Csr(0, 0, 1, 10); + doCsc2Csr(0, 0, 1, 10); + doCsc2Csr(0, 0, 1, 10); // Square cases for (size_t i = 4; i < 1024; i *= 4) { size_t dim = (std::rand() % 511) + 1; - doAllCsc2csr(dim, dim); + doAllCsc2csr(dim, dim); } // Non-square cases @@ -144,11 +144,11 @@ TEST_F(TestCategory, sparse_csc2csr) { size_t m = (std::rand() % 511) + 1; size_t n = (std::rand() % 511) + 1; while (n == m) n = (std::rand() % 511) + 1; - doAllCsc2csr(m, n); + doAllCsc2csr(m, n); } // Fully sparse cases - doCsc2Csr(5, 5, 1, 10, true); - doCsc2Csr(50, 10, 10, 100, true); + doCsc2Csr(5, 5, 1, 10, true); + doCsc2Csr(50, 10, 10, 100, true); } } // namespace Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp new file mode 100644 index 0000000000..327780dec3 --- /dev/null +++ b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -0,0 +1,154 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosSparse_Utils.hpp" +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +template +void run_test_extract_diagonal_blocks(int nrows, int nblocks) { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hm = typename RowMapType::HostMirror; + using EntriesType_hm = typename EntriesType::HostMirror; + using ValuesType_hm = typename ValuesType::HostMirror; + using crsMat_t = CrsMatrix; + + crsMat_t A; + std::vector DiagBlks(nblocks); + + if (nrows != 0) { + // Generate test matrix + const size_type nnz = 2 + (nrows - 2) * 3 + 2; + RowMapType_hm hrow_map("hrow_map", nrows + 1); + EntriesType_hm hentries("hentries", nnz); + ValuesType_hm hvalues("hvalues", nnz); + + // first row + hrow_map(0) = 0; + hentries(0) = 0; + hentries(1) = 1; + hvalues(0) = 0; + hvalues(1) = 1; + // rows in between + int cnt = 2; + for (int i = 1; i <= (nrows - 2); i++) { + hrow_map(i) = cnt; + hentries(cnt) = -1 + i; + hentries(cnt + 1) = 0 + i; + hentries(cnt + 2) = 1 + i; + hvalues(cnt) = -1 + i; + hvalues(cnt + 1) = 0 + i; + hvalues(cnt + 2) = 1 + i; + cnt += 3; + } + // last row + hrow_map(nrows - 1) = cnt; + hentries(nnz - 2) = nrows - 2; + hentries(nnz - 1) = nrows - 1; + hvalues(nnz - 2) = nrows - 2; + hvalues(nnz - 1) = nrows - 1; + // last element of row_map + hrow_map(nrows) = nnz; + + // Allocate A on device memory + RowMapType row_map("row_map", nrows + 1); + EntriesType 
entries("entries", nnz); + ValuesType values("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map, hrow_map); + Kokkos::deep_copy(entries, hentries); + Kokkos::deep_copy(values, hvalues); + + // Construct a CRS matrix + A = crsMat_t("CrsMatrix", nrows, nrows, nnz, values, row_map, entries); + } + + // Extract + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, + DiagBlks); + + // Checking + lno_t numRows = 0; + lno_t numCols = 0; + for (int i = 0; i < nblocks; i++) { + numRows += DiagBlks[i].numRows(); + numCols += DiagBlks[i].numCols(); + } + + EXPECT_TRUE(numRows == static_cast(nrows)); + EXPECT_TRUE(numCols == static_cast(nrows)); + + if (nrows > 0) { + bool flag = true; + lno_t col_start = 0; + for (int i = 0; i < nblocks; i++) { + RowMapType_hm hrow_map_diagblk("hrow_map_diagblk", + DiagBlks[i].numRows() + 1); + EntriesType_hm hentries_diagblk("hentries_diagblk", DiagBlks[i].nnz()); + ValuesType_hm hvalues_diagblk("hvalues_diagblk", DiagBlks[i].nnz()); + + Kokkos::deep_copy(hrow_map_diagblk, DiagBlks[i].graph.row_map); + Kokkos::deep_copy(hentries_diagblk, DiagBlks[i].graph.entries); + Kokkos::deep_copy(hvalues_diagblk, DiagBlks[i].values); + + for (int j = 0; j < static_cast(DiagBlks[i].numRows()); j++) { + size_type k1 = hrow_map_diagblk(j); + size_type k2 = hrow_map_diagblk(j + 1); + for (size_type k = k1; k < k2; k++) { + scalar_t col = static_cast(hentries_diagblk(k) + col_start); + scalar_t val = hvalues_diagblk(k); + if (Kokkos::abs(col - val) != 0) { + flag = false; + break; + } + } + if (flag == false) break; + } + if (flag == false) break; + col_start += DiagBlks[i].numCols(); + } + EXPECT_TRUE(flag); + } +} +} // namespace Test + +template +void test_extract_diagonal_blocks() { + for (int s = 1; s <= 8; s++) { + Test::run_test_extract_diagonal_blocks( + 0, s); + Test::run_test_extract_diagonal_blocks( + 12, s); + Test::run_test_extract_diagonal_blocks( + 123, s); + } +} + +#define 
KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##extract_diagonal_blocks##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_extract_diagonal_blocks(); \ + } + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/sparse/unit_test/Test_Sparse_findRelOffset.hpp b/sparse/unit_test/Test_Sparse_findRelOffset.hpp index 9c7224b756..642f1666e7 100644 --- a/sparse/unit_test/Test_Sparse_findRelOffset.hpp +++ b/sparse/unit_test/Test_Sparse_findRelOffset.hpp @@ -430,13 +430,13 @@ void test_findRelOffset() { #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #undef EXECUTE_TEST diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 358205b713..35fbcb44a4 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -56,7 +56,7 @@ namespace Test { // Run GS on the given vectors, where the handle is already set up. template void run_gauss_seidel( - Handle& kh, crsMat_t input_mat, vec_t x_vector, vec_t y_vector, + Handle &kh, crsMat_t input_mat, vec_t x_vector, vec_t y_vector, bool is_symmetric_graph, typename crsMat_t::value_type omega, int apply_type = 0 // 0 for symmetric, 1 for forward, 2 for backward. 
) { @@ -142,6 +142,59 @@ void run_gauss_seidel( kh.destroy_gs_handle(); } +template +void run_gauss_seidel_streams( + std::vector &instances, std::vector &kh, + std::vector &input_mat, std::vector &x_vector, + std::vector &y_vector, bool is_symmetric_graph, + typename crsMat_t::value_type omega, + int apply_type, // 0 for symmetric, 1 for forward, 2 for backward. + int nstreams = 1) { + for (int i = 0; i < nstreams; i++) { + gauss_seidel_symbolic(instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, is_symmetric_graph); + gauss_seidel_numeric(instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, + is_symmetric_graph); + } + + const int apply_count = 2; + for (int i = 0; i < nstreams; i++) { + switch (apply_type) { + case 0: + symmetric_gauss_seidel_apply( + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); + break; + case 1: + forward_sweep_gauss_seidel_apply( + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); + break; + case 2: + backward_sweep_gauss_seidel_apply( + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); + break; + default: + symmetric_gauss_seidel_apply( + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); + break; + } + } +} } // namespace 
Test template crsMat_t; - typedef Kokkos::View scalar_view2d_t; - typedef Kokkos::View + typedef Kokkos::View scalar_view2d_t; + typedef Kokkos::View host_scalar_view2d_t; typedef typename Kokkos::ArithTraits::mag_type mag_t; @@ -396,7 +449,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, // initial solution is zero Kokkos::deep_copy(x_host, zero); // get the inverse diagonal (only needed on host) - Kokkos::View invDiag("diag^-1", numRows); + Kokkos::View invDiag("diag^-1", numRows); for (lno_t i = 0; i < numRows; i++) { for (size_type j = rowmap(i); j < rowmap(i + 1); j++) { if (entries(j) == i) invDiag(i) = one / values(j); @@ -574,11 +627,11 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries"), totalEntries); rowmap_view_t rowmapView( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Rowmap"), numRows + 1); - Kokkos::deep_copy(valuesView, Kokkos::View( + Kokkos::deep_copy(valuesView, Kokkos::View( values.data(), totalEntries)); - Kokkos::deep_copy(entriesView, Kokkos::View( + Kokkos::deep_copy(entriesView, Kokkos::View( entries.data(), totalEntries)); - Kokkos::deep_copy(rowmapView, Kokkos::View( + Kokkos::deep_copy(rowmapView, Kokkos::View( rowmap.data(), numRows + 1)); crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); @@ -662,58 +715,201 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { EXPECT_LT(result_norm_res, 0.25 * initial_norm_res); } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank1(2000, 2000 * 20, \ - 200, 10, false); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_asymmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank2( \ - 2000, 2000 * 20, 200, 10, 3, false); \ 
- } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_symmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank1(2000, 2000 * 20, \ - 200, 10, true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_symmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank2( \ - 2000, 2000 * 20, 200, 10, 3, true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_empty##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_empty(); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##balloon_clustering##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_balloon_clustering(5000, 100, 2000); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##sequential_sor##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_sequential_sor(1000, 1000 * 15, 50, \ - 10); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_long_rows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_long_rows(500, 10, 20, \ - true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_custom_coloring##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_custom_coloring(500, \ - 10); \ +template +void test_gauss_seidel_streams_rank1( + lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, + bool symmetric, double omega, + KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT, + int nstreams = 1) { + using namespace Test; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using mag_t = typename Kokkos::ArithTraits::mag_type; + using execution_space = typename device::execution_space; + + using const_size_type = const size_type; + using const_lno_t = const lno_t; + using const_scalar_t = const scalar_t; + using KernelHandle = + KokkosKernelsHandle; + srand(245); + lno_t numCols = numRows; + typename crsMat_t::value_type m_omega = 
omega; + +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same_v) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + std::cerr << "TEST SKIPPED: Not enough concurrency to partition " + "execution space. exec_concurrency: " + << exec_concurrency << std::endl; + return; + } + } +#endif // KOKKOS_ENABLE_OPENMP + + std::vector instances; + if (nstreams == 1) + instances = Kokkos::Experimental::partition_space(execution_space(), 1); + else if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector input_mat_v(nstreams); + std::vector solution_x_v(nstreams); + std::vector x_vector_v(nstreams); + std::vector y_vector_v(nstreams); + std::vector initial_norm_res_v(nstreams); + + const scalar_t one = Kokkos::ArithTraits::one(); + const scalar_t zero = Kokkos::ArithTraits::zero(); + + for (int i = 0; i < nstreams; i++) { + input_mat_v[i] = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); + + if (symmetric) { + // Symmetrize on host, rather than relying on the parallel versions (those + // can be tested for symmetric=false) + input_mat_v[i] = + Test::symmetrize( + input_mat_v[i]); + } + lno_t nv = input_mat_v[i].numRows(); + scalar_view_t solution_x_tmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), nv); + solution_x_v[i] = solution_x_tmp; + create_random_x_vector(solution_x_v[i]); + initial_norm_res_v[i] = KokkosBlas::nrm2(solution_x_v[i]); + y_vector_v[i] = create_random_y_vector(input_mat_v[i], solution_x_v[i]); + // GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the + // behavior of each algorithm _should be_ the same on every 
execution space, + // which is why we just test GS_DEFAULT. + + scalar_view_t x_vector_tmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); + x_vector_v[i] = x_vector_tmp; + + kh_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); + } + + int apply_count = 3; // test symmetric, forward, backward + //*** Point-coloring version **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + + run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, + y_vector_v, symmetric, m_omega, apply_type, + nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; + } + } + + for (int i = 0; i < nstreams; i++) kh_v[i].destroy_gs_handle(); +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank1(2000, 2000 * 20, \ + 200, 10, false); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_streams_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 1); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 2); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 3); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 4); \ + } \ + TEST_F( \ + TestCategory, \ + 
sparse##_##gauss_seidel_asymmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank2( \ + 2000, 2000 * 20, 200, 10, 3, false); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank1(2000, 2000 * 20, \ + 200, 10, true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_streams_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 1); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 2); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 3); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 4); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank2( \ + 2000, 2000 * 20, 200, 10, 3, true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_empty##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_empty(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##balloon_clustering##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_balloon_clustering(5000, 100, 2000); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##sequential_sor##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_sequential_sor(1000, 1000 * 15, 50, \ + 10); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_long_rows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_long_rows(500, 10, 20, \ + true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_custom_coloring##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_custom_coloring(500, \ + 10); \ } 
#include diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index f6e4d0bc84..4b5b65aeb3 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -16,8 +16,8 @@ #include #include - #include "KokkosSparse_mdf.hpp" +#include "KokkosSparse_CrsMatrix.hpp" namespace Test { diff --git a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp index b5c57dbe49..52a9a1874b 100644 --- a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp +++ b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp @@ -91,9 +91,7 @@ Matrix loadMatrixFromVectors(int numRows, int numCols, template void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { - using Offset = typename Matrix::size_type; - using Device = - Kokkos::Device; + using Offset = typename Matrix::size_type; bool haveHardcodedReference = true; switch (test) { case 0: { @@ -226,7 +224,8 @@ void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { if (haveHardcodedReference) { Matrix Afiltered_refimpl = removeMatrixZerosReference(A); bool referenceImplMatchesHardcoded = - Test::is_same_matrix(Afiltered_ref, Afiltered_refimpl); + Test::is_same_matrix(Afiltered_ref, + Afiltered_refimpl); ASSERT_TRUE(referenceImplMatchesHardcoded) << "Test case " << test << ": reference impl gave wrong answer!"; } @@ -236,15 +235,13 @@ void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { void testRemoveCrsMatrixZeros(int testCase) { using namespace TestRemoveCrsMatrixZeros; - using Device = - Kokkos::Device; - using Matrix = KokkosSparse::CrsMatrix; + using Matrix = KokkosSparse::CrsMatrix; Matrix A, Afiltered_ref; getTestInput(testCase, A, Afiltered_ref); Matrix Afiltered_actual = KokkosSparse::removeCrsMatrixZeros(A); bool matches = - Test::is_same_matrix(Afiltered_actual, Afiltered_ref); + Test::is_same_matrix(Afiltered_actual, Afiltered_ref); EXPECT_TRUE(matches) << "Test case " << testCase 
<< ": matrix with zeros filtered out does not match reference."; diff --git a/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp b/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp index 98affff57d..224b72e2b7 100644 --- a/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp +++ b/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp @@ -490,9 +490,7 @@ void test_replaceSumIntoLonger() { // FIXME SYCL: test hangs or gives "CL error -46 invalid kernel name" #ifndef KOKKOS_ENABLE_SYCL - #include +#endif // KOKKOS_ENABLE_SYCL #undef KOKKOSKERNELS_EXECUTE_TEST - -#endif // KOKKOS_ENABLE_SYCL diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 9da0733581..990fcc1a30 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -89,9 +89,34 @@ struct fSPMV { if (error > eps * max_val) { err++; +#if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "expected_y(%d)=%f, y(%d)=%f err=%f, max_error=%f\n", i, + "expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); +#else + Kokkos::printf("expected_y(%d)=%f, y(%d)=%f err=%e, max_error=%e\n", i, + AT::abs(expected_y(i)), i, AT::abs(y(i)), error, + eps * max_val); +#endif + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, value_type &err) const { + const mag_type error = AT::abs(expected_y(i, j) - y(i, j)); + + if (error > eps * max_val) { + err++; +#if KOKKOS_VERSION < 40199 + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", i, j, + AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), error, + eps * max_val); +#else + Kokkos::printf("expected_y(%d,%d)=%f, y(%d,%d)=%f err=%e, max_error=%e\n", + i, j, AT::abs(expected_y(i, j)), i, j, AT::abs(y(i, j)), + error, eps * max_val); +#endif } } }; @@ -100,7 +125,7 @@ template void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, typename 
y_vector_type::non_const_value_type alpha, typename y_vector_type::non_const_value_type beta, - char mode = 'N') { + const std::string &mode = "N") { using graph_t = typename crsMat_t::StaticCrsGraphType; using size_type_view_t = typename graph_t::row_map_type; using lno_view_t = typename graph_t::entries_type; @@ -111,8 +136,6 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using scalar_t = typename scalar_view_t::non_const_value_type; using KAT = Kokkos::ArithTraits; - mode = toupper(mode); - typename scalar_view_t::HostMirror h_values = Kokkos::create_mirror_view(input_mat.values); Kokkos::deep_copy(h_values, input_mat.values); @@ -143,13 +166,13 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, for (size_type j = h_rowmap(row); j < h_rowmap(row + 1); ++j) { lno_t col = h_entries(j); scalar_t val = h_values(j); - if (mode == 'N') + if (mode == "N") h_y(row) += alpha * val * h_x(col); - else if (mode == 'C') + else if (mode == "C") h_y(row) += alpha * KAT::conj(val) * h_x(col); - else if (mode == 'T') + else if (mode == "T") h_y(col) += alpha * val * h_x(row); - else if (mode == 'H') + else if (mode == "H") h_y(col) += alpha * KAT::conj(val) * h_x(row); } } @@ -159,12 +182,14 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv( - const Controls &controls, crsMat_t input_mat, x_vector_type x, - y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, char mode, + const KokkosKernels::Experimental::Controls &controls, crsMat_t input_mat, + x_vector_type x, y_vector_type y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, const std::string &mode, typename Kokkos::ArithTraits::mag_type max_val) { - // typedef typename crsMat_t::StaticCrsGraphType graph_t; + EXPECT_TRUE(mode.size() == 1); + using ExecSpace = typename 
crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -173,7 +198,7 @@ void check_spmv( const y_value_mag_type eps = 10 * Kokkos::ArithTraits::eps(); - bool transposed = (mode == 'T') || (mode == 'H'); + bool transposed = (mode == "T") || (mode == "H"); y_vector_type expected_y( "expected", transposed ? input_mat.numCols() : input_mat.numRows()); Kokkos::deep_copy(expected_y, y); @@ -183,7 +208,7 @@ void check_spmv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(controls, &mode, alpha, input_mat, x, beta, y); + KokkosSparse::spmv(controls, mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -209,9 +234,12 @@ void check_spmv_mv( crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV, char mode, + typename y_vector_type::non_const_value_type beta, int numMV, + const std::string &mode, typename Kokkos::ArithTraits::mag_type max_val) { + EXPECT_TRUE(mode.size() == 1); + using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -231,7 +259,7 @@ void check_spmv_mv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + KokkosSparse::spmv(mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -424,8 +452,9 @@ Kokkos::complex randomUpperBound>(int mag) { template -void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, - lno_t bandwidth, lno_t row_size_variance, bool heavy) { +void test_spmv(const KokkosKernels::Experimental::Controls &controls, + lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t row_size_variance, bool heavy) { using crsMat_t = 
typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; @@ -466,12 +495,12 @@ void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, Kokkos::fill_random(input_mat.values, rand_pool, randomUpperBound(max_val)); - std::vector nonTransModes = {'N'}; - std::vector transModes = {'T'}; - std::vector testAlphaBeta = {0.0, 1.0}; + std::vector nonTransModes = {"N"}; + std::vector transModes = {"T"}; + std::vector testAlphaBeta = {0.0, 1.0}; if (heavy) { - nonTransModes.push_back('C'); - transModes.push_back('H'); + nonTransModes.push_back("C"); + transModes.push_back("H"); testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } @@ -503,17 +532,29 @@ template ( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } { - Controls controls; + KokkosKernels::Experimental::Controls controls; controls.setParameter("algorithm", "native"); test_spmv( controls, numRows, nnz, bandwidth, row_size_variance, heavy); } + { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "merge"); + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } + { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "native-merge"); + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } } template nonTransModes = {'N'}; - std::vector transModes = {'T'}; - std::vector testAlphaBeta = {0.0, 1.0}; + std::vector nonTransModes = {"N"}; + std::vector transModes = {"T"}; + std::vector testAlphaBeta = {0.0, 1.0}; if (heavy) { - nonTransModes.push_back('C'); - transModes.push_back('H'); + nonTransModes.push_back("C"); + transModes.push_back("H"); testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } @@ -637,18 +678,18 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::deep_copy(b_y_copy, b_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N', + 
Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "N", max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "N", max_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, "N", max_y + max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "T", max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T', + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "T", max_y); // Testing all modes together, since matrix is square - std::vector modes = {'N', 'C', 'T', 'H'}; + std::vector modes = {"N", "C", "T", "H"}; std::vector testAlphaBeta = {0.0, 1.0, -1.0, 2.5}; for (auto mode : modes) { for (double alpha : testAlphaBeta) { @@ -919,7 +960,8 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { template void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, - const Controls &controls = Controls()) { + const KokkosKernels::Experimental::Controls &controls = + KokkosKernels::Experimental::Controls()) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; @@ -962,7 +1004,7 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, template void test_spmv_native(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { - Controls controls; + KokkosKernels::Experimental::Controls controls; controls.setParameter("algorithm", "native"); test_spmv_controls(numRows, nnz, bandwidth, row_size_variance, controls); } // test_spmv_native @@ -1112,487 +1154,121 @@ void test_github_issue_101() { } } -template -CrsMat make_block_matrix(typename 
CrsMat::ordinal_type &numRows, - typename CrsMat::ordinal_type &numCols, - typename CrsMat::ordinal_type &blockSize) { - using lno_t = typename CrsMat::ordinal_type; - using scalar_t = typename CrsMat::value_type; - - using Kokkos::HostSpace; - using Kokkos::MemoryUnmanaged; - using Kokkos::View; - - Kokkos::Random_XorShift64 rand(13718); - - // fill outputs with random values - // Kokkos::Random_XorShift64_Pool rand_pool(13718); - // Kokkos::fill_random(hi_x, rand_pool, randomUpperBound(10)); - - std::vector values; - std::vector rowmap; - std::vector entries; - - // each row of blocks - for (lno_t bi = 0; bi < numRows; bi += blockSize) { - // target number of blocks in the row - lno_t rowBlockCount = 3; - { - // cap the number of blocks in the row - lno_t maxBlocksInRow = numCols / blockSize; - rowBlockCount = std::min(maxBlocksInRow, rowBlockCount); - } - - // where the blocks in this row of blocks start - // add that many blocks at random positions in the row - std::vector bjs; - for (int _ = 0; _ < rowBlockCount; ++_) { - bjs.push_back(rand.rand(numCols / blockSize) * blockSize); - } - - // remove duplicates - { - std::sort(bjs.begin(), bjs.end()); - auto it = std::unique(bjs.begin(), bjs.end()); - bjs.resize(it - bjs.begin()); - } - - for (lno_t i = bi; i < bi + blockSize; ++i) { - rowmap.push_back(entries.size()); // where this row starts - - // for each block - for (size_t block = 0; block < bjs.size(); ++block) { - lno_t bj = bjs[block]; - for (lno_t j = bj; j < bj + blockSize; ++j) { - entries.push_back(j); - values.push_back(rand.rand(10)); - // values.push_back(1); - } - } - } - } - - while (rowmap.size() < numRows + 1) { - rowmap.push_back(entries.size()); - } - - return CrsMat("", numRows, numCols, values.size(), values.data(), - rowmap.data(), entries.data()); -} - -struct Coordinate { - int i; - int j; - Coordinate(int _i, int _j) : i(_i), j(_j) {} - // sort by i then j - static bool by_ij(const Coordinate &a, const Coordinate &b) { - if (a.i < b.i) 
{ - return true; - } else if (a.i > b.i) { - return false; - } else { - return a.j < b.j; - } - } -}; -struct Entry { - Coordinate c; - double e; - Entry(int i, int j, double _e) : c(i, j), e(_e) {} - static bool by_ij(const Entry &a, const Entry &b) { - return Coordinate::by_ij(a.c, b.c); - } -}; - -// expand a pattern into a blocked CrsMatrix -template ::value, bool> = true> -Matrix expand_matrix(std::vector pattern, const int m, const int k, - const int blockSize, const int seed = 0) { - typedef typename Matrix::value_type Scalar; - typedef typename Matrix::ordinal_type Ordinal; - typedef typename Matrix::non_const_size_type Offset; - typedef Kokkos::View> - UnmanagedRowmap; - typedef Kokkos::View> - UnmanagedEntries; - typedef Kokkos::View> - UnmanagedValues; - - srand(seed); - - auto gen_rand = []() -> double { return rand() % 10; }; - - // check rows and columns - for (const Coordinate &c : pattern) { - if (c.i >= m) { - KokkosKernels::Impl::throw_runtime_exception("i exceeded matrix rows"); - } - if (c.j >= k) { - KokkosKernels::Impl::throw_runtime_exception("j exceeded matrix cols"); - } - } - - // order the blocks - std::sort(pattern.begin(), pattern.end(), Coordinate::by_ij); - - // create coo entries for each block - std::vector entries; - for (const Coordinate &c : pattern) { - for (int i = 0; i < blockSize; ++i) { - for (int j = 0; j < blockSize; ++j) { - entries.push_back( - Entry(c.i * blockSize + i, c.j * blockSize + j, gen_rand())); - } - } - } - - std::sort(entries.begin(), entries.end(), Entry::by_ij); - - std::vector rowMap; - std::vector colInd; - std::vector val; - - for (Entry &e : entries) { - while (rowMap.size() < size_t(e.c.i + 1)) { // catch empty rows - rowMap.push_back(colInd.size()); - } - colInd.push_back(e.c.j); - val.push_back(e.e); - } - // possibly empty rows at end of matrix - while (rowMap.size() <= size_t(m * blockSize)) { - rowMap.push_back(colInd.size()); - } - - typename Matrix::row_map_type::non_const_type sparseRowMap("", 
rowMap.size()); - Kokkos::deep_copy(sparseRowMap, - UnmanagedRowmap(rowMap.data(), rowMap.size())); - typename Matrix::index_type::non_const_type sparseCols("", colInd.size()); - Kokkos::deep_copy(sparseCols, UnmanagedEntries(colInd.data(), colInd.size())); - typename Matrix::values_type::non_const_type sparseVals("", val.size()); - Kokkos::deep_copy(sparseVals, UnmanagedValues(val.data(), val.size())); - - Matrix mat("crs", m * blockSize, k * blockSize, sparseVals.size(), sparseVals, - sparseRowMap, sparseCols); - return mat; -} - -template < - typename Matrix, - std::enable_if_t::value, - bool> = true> -Matrix expand_matrix(std::vector pattern, const int m, const int k, - const int blockSize, const int seed = 0) { - typedef typename Matrix::value_type Scalar; - typedef typename Matrix::ordinal_type Ordinal; - typedef typename Matrix::non_const_size_type Offset; - typedef Kokkos::View> - UnmanagedRowmap; - typedef Kokkos::View> - UnmanagedEntries; - typedef Kokkos::View> - UnmanagedValues; - - srand(seed); - - auto gen_rand = []() -> double { return rand() % 10; }; - - // determine the number of rows and columns - // check rows and columns - for (const Coordinate &c : pattern) { - if (c.i >= m) { - KokkosKernels::Impl::throw_runtime_exception("i exceeded matrix rows"); - } - if (c.j >= k) { - KokkosKernels::Impl::throw_runtime_exception("j exceeded matrix cols"); - } - } - - // order the blocks - std::sort(pattern.begin(), pattern.end(), Coordinate::by_ij); - - // create values in order of the blocks (storage order for BSR) - std::vector val(pattern.size() * blockSize * blockSize); - for (typename std::vector::size_type idx = 0; idx < val.size(); - ++idx) { - val[idx] = gen_rand(); - } - - /* create the BsrMatrix adjacency info - use the sorted pattern. 
val is already in the correct storage order - */ - std::vector rowMap; - std::vector colInd; - - for (Coordinate &e : pattern) { - while (rowMap.size() < size_t(e.i + 1)) { // catch empty rows - rowMap.push_back(colInd.size()); - } - colInd.push_back(e.j); - } - // possibly empty rows at end of matrix - while (rowMap.size() <= size_t(m)) { - rowMap.push_back(colInd.size()); - } - - typename Matrix::row_map_type::non_const_type sparseRowMap("", rowMap.size()); - Kokkos::deep_copy(sparseRowMap, - UnmanagedRowmap(rowMap.data(), rowMap.size())); - typename Matrix::index_type::non_const_type sparseCols("", colInd.size()); - Kokkos::deep_copy(sparseCols, UnmanagedEntries(colInd.data(), colInd.size())); - typename Matrix::values_type::non_const_type sparseVals("", val.size()); - Kokkos::deep_copy(sparseVals, UnmanagedValues(val.data(), val.size())); - Kokkos::fence(); - - Matrix mat("bsr", m, k, sparseVals.size(), sparseVals, sparseRowMap, - sparseCols, blockSize); - return mat; -} - -/* a_scalar_t: the matrix type - x_scalar_t: the x-vector type - y_scalar_t: the y-vector type - - blockSize: the size of the dense blocks in the matrix - pattern: the non-zero locations of the blocks - m,n: the multiplication dimensions (in terms of blockSize) - k: number of vectors in the multivector - y[m*blockSize x k] = A[m*blockSize x n*blockSize] * x[n*blockSize x k] - - Compare the BsrMatrix spmv against a KokkosSparse::spmv on the same operands. 
- The controls are used in the BsrMatrix SpMV invocation - -*/ -template -void test_spmv_bsrmatrix_controls_pattern( - const KokkosKernels::Experimental::Controls &controls, - const std::vector &pattern, const int m, const int n, - lno_t blockSize, lno_t k, y_scalar_t alpha, y_scalar_t beta, - const int max_blocks_per_row) { - // get the widest passed scalar type - // typedef typename std::conditional= sizeof(x_scalar_t), - // a_scalar_t, x_scalar_t>::type wider_t; - // typedef typename std::conditional= sizeof(y_scalar_t), - // wider_t, y_scalar_t>::type widest_t; - - using crs_mat_t = typename KokkosSparse::CrsMatrix; - using bsr_mat_t = - typename KokkosSparse::Experimental::BsrMatrix; - using x_view_t = Kokkos::View; - using y_view_t = Kokkos::View; - - using DeviceRangePolicy = Kokkos::RangePolicy; - - crs_mat_t crs = expand_matrix(pattern, m, n, blockSize); - bsr_mat_t bsr = expand_matrix(pattern, m, n, blockSize); - - // only tue if the original matrix is a multiple of block size, and all blocks - // are dense - EXPECT_TRUE(bsr.nnz() * bsr.blockDim() * bsr.blockDim() == crs.nnz()); - EXPECT_TRUE(bsr.numRows() * bsr.blockDim() == crs.numRows()); - EXPECT_TRUE(bsr.numCols() * bsr.blockDim() == crs.numCols()); - - // expected operands - x_view_t exp_x("exp_x", n * blockSize, k); - y_view_t exp_y("exp_y", m * blockSize, k); - - // test operands - y_view_t test_y("test_y", m * blockSize, k); - x_view_t test_x("test_x", n * blockSize, k); - - constexpr x_scalar_t max_x = 10; - constexpr y_scalar_t max_y = 10; - constexpr a_scalar_t max_a = 10; - const double max_val = - beta * max_y + alpha * max_blocks_per_row * max_a * max_x; - - // fill expected with random values - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); - Kokkos::fill_random(exp_x, rand_pool, - randomUpperBound(max_x)); - Kokkos::fill_random(exp_y, rand_pool, - randomUpperBound(max_y)); - - // copy expected operands to test operands - Kokkos::deep_copy(test_x, exp_x); - Kokkos::deep_copy(test_y, 
exp_y); - Kokkos::fence(); - - // generate expected y vector - // some error about Blas implementation - KokkosSparse::spmv("N", alpha, crs, exp_x, beta, exp_y); - Kokkos::fence(); - - // invoke tensor-core spmv - KokkosSparse::spmv(controls, "N", alpha, bsr, test_x, beta, test_y); - Kokkos::fence(); - - // test each vector - for (lno_t ki = 0; ki < k; ++ki) { - auto exp_y_i = Kokkos::subview(exp_y, Kokkos::ALL(), ki); - auto test_y_i = Kokkos::subview(test_y, Kokkos::ALL(), ki); - - // count errors +template +void test_spmv_all_interfaces_light() { + // Using a small matrix, run through the various SpMV interfaces and + // make sure they produce the correct results. + using execution_space = typename DeviceType::execution_space; + using mag_t = typename Kokkos::ArithTraits::mag_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + const lno_t m = 111; + const lno_t n = 99; + const mag_t maxVal = 10.0; + const mag_t eps = 10.0 * Kokkos::ArithTraits::eps(); + size_type nnz = 600; + crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( + m, n, nnz, 2, lno_t(n * 0.7)); + // note: A's values are in range [0, 50) + const mag_t maxError = (nnz / m) * 50.0 * maxVal; + using multivector_t = Kokkos::View; + using vector_t = Kokkos::View; + using range1D_t = Kokkos::RangePolicy; + using range2D_t = Kokkos::MDRangePolicy>; + multivector_t x_mv("x_mv", n, 3); + vector_t x("x", n); + // Randomize x (it won't be modified after that) + Kokkos::fill_random(x_mv, rand_pool, randomUpperBound(maxVal)); + Kokkos::fill_random(x, rand_pool, randomUpperBound(maxVal)); + multivector_t y_mv("y_mv", m, 3); + vector_t y("y", m); + // Compute the correct y = Ax once + multivector_t ygold_mv("ygold_mv", m, 3); + vector_t ygold("ygold", m); + for (lno_t i = 0; i < 3; i++) + Test::sequential_spmv(A, Kokkos::subview(x_mv, Kokkos::ALL(), i), + Kokkos::subview(ygold_mv, Kokkos::ALL(), i), 1.0, + 0.0); + Test::sequential_spmv(A, x, 
ygold, 1.0, 0.0); + auto clear_y = [&]() { Kokkos::deep_copy(y_mv, scalar_t(0)); }; + auto verify = [&]() { + int num_errors = 0; + Kokkos::parallel_reduce( + "KokkosSparse::Test::spmv", range1D_t(0, m), + Test::fSPMV(ygold, y, eps, maxError), num_errors); + EXPECT_EQ(num_errors, 0); + }; + auto verify_mv = [&]() { int num_errors = 0; - // Kokkos::ArithTraits in CUDA 9 is float on the host - // for CUDA 9, Kokkos half is actually float. However, the tensor core SpMV - // uses CUDA's half type, not Kokkos, so we still need a reduced precision - // test. - double eps = - 2 * KOKKOSKERNELS_IMPL_FP16_EPSILON * KOKKOSKERNELS_IMPL_FP16_RADIX; - Kokkos::parallel_reduce("KokkosSparse::Test::spmv_tc", - DeviceRangePolicy(0, exp_y_i.extent(0)), - Test::fSPMV( - exp_y_i, test_y_i, eps, max_val), + Kokkos::parallel_reduce("KokkosSparse::Test::spmv", + range2D_t({0, 0}, {m, 3}), + Test::fSPMV( + ygold_mv, y_mv, eps, maxError), num_errors); - // explicit cast to double since no overload for half::operator<< - if (num_errors > 0) - std::cout << "KokkosSparse::Test::spmv_tc: " << num_errors - << " errors of " << exp_y_i.extent_int(0) << " for mv " << ki - << " (alpha=" - << double(Kokkos::ArithTraits::abs(alpha)) - << ", beta=" - << double(Kokkos::ArithTraits::abs(beta)) - << ", mode = N" - << ")\n"; - EXPECT_TRUE(num_errors == 0); - } -} - -/* test a particular pattern with all supported controls - */ -template -void test_spmv_bsrmatrix_pattern(const std::vector &pattern, - const int m, const int n, lno_t blockSize, - lno_t k, y_scalar_t alpha, y_scalar_t beta, - const int max_blocks_per_row) { - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "experimental_bsr_tc"); - test_spmv_bsrmatrix_controls_pattern( - controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); + EXPECT_EQ(num_errors, 0); + }; + // Now run through the interfaces and check results each time. 
+ execution_space space; + std::vector space_partitions; + if (space.concurrency() > 1) { + space_partitions = Kokkos::Experimental::partition_space(space, 1, 1); + space = space_partitions[1]; } - -#if defined(KOKKOS_ARCH_AMPERE) - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "experimental_bsr_tc"); - controls.setParameter("tc_precision", "double"); - test_spmv_bsrmatrix_controls_pattern( - controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } -#endif -} - -/* test a bunch of different matrices - */ -template -void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, - y_scalar_t beta) { KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "experimental_bsr_tc"); - - // 1x1 full - { - int m = 1; - int n = 1; - int max_blocks_per_row = 1; - std::vector pattern = {Coordinate(0, 0)}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 1x1 empty - { - int m = 1; - int n = 1; - int max_blocks_per_row = 0; - std::vector pattern = {}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 2x2 top-left - { - int m = 2; - int n = 2; - int max_blocks_per_row = 1; - std::vector pattern = {Coordinate(0, 0)}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 2x2 bottom right - { - int m = 2; - int n = 2; - int max_blocks_per_row = 1; - std::vector pattern = {Coordinate(1, 1)}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 2x3 bottom right - { - int m = 2; - int n = 3; - int max_blocks_per_row = 1; - std::vector pattern = {Coordinate(1, 2)}; - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 2x10 long bottom row - { - int m = 2; - int n = 10; - int max_blocks_per_row = 10; - std::vector 
pattern; - for (int j = 0; j < n; ++j) { - pattern.push_back(Coordinate(1, j)); - } - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } - - // 10x10 column 1 + diagonal - { - int m = 10; - int n = 10; - int max_blocks_per_row = 2; - std::vector pattern; - for (int i = 0; i < n; ++i) { - pattern.push_back(Coordinate(i, 1)); - if (i != 1) { - pattern.push_back(Coordinate(i, i)); - } - } - test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); - } + // All tagged versions + KokkosSparse::spmv(space, controls, "N", 1.0, A, x, 0.0, y, + KokkosSparse::RANK_ONE()); + space.fence(); + verify(); + clear_y(); + KokkosSparse::spmv(controls, "N", 1.0, A, x, 0.0, y, + KokkosSparse::RANK_ONE()); + verify(); + clear_y(); + KokkosSparse::spmv(space, controls, "N", 1.0, A, x_mv, 0.0, y_mv, + KokkosSparse::RANK_TWO()); + space.fence(); + verify_mv(); + clear_y(); + KokkosSparse::spmv(controls, "N", 1.0, A, x_mv, 0.0, y_mv, + KokkosSparse::RANK_TWO()); + verify_mv(); + clear_y(); + // Non-tagged versions + // space and controls + spmv(space, controls, "N", 1.0, A, x, 0.0, y); + space.fence(); + verify(); + clear_y(); + spmv(space, controls, "N", 1.0, A, x_mv, 0.0, y_mv); + space.fence(); + verify_mv(); + clear_y(); + // controls + spmv(controls, "N", 1.0, A, x, 0.0, y); + verify(); + clear_y(); + spmv(controls, "N", 1.0, A, x_mv, 0.0, y_mv); + verify_mv(); + clear_y(); + // space + spmv(space, "N", 1.0, A, x, 0.0, y); + space.fence(); + verify(); + clear_y(); + spmv(space, "N", 1.0, A, x_mv, 0.0, y_mv); + space.fence(); + verify_mv(); + clear_y(); + // neither + spmv("N", 1.0, A, x, 0.0, y); + verify(); + clear_y(); + spmv("N", 1.0, A, x_mv, 0.0, y_mv); + verify_mv(); + clear_y(); } #define EXECUTE_TEST_ISSUE_101(DEVICE) \ @@ -1619,6 +1295,14 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, 100, 5); \ } +#define EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) 
\ + TEST_F( \ + TestCategory, \ + sparse_spmv_interfaces_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + test_spmv_all_interfaces_light(); \ + } + #define EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ TEST_F( \ TestCategory, \ @@ -1664,132 +1348,14 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, 10, 2); \ } -/* Tensor Core SpMV - blocksize, k, alpha, beta -*/ -#define EXECUTE_TEST_TC(ASCALAR, XSCALAR, YSCALAR, ORDINAL, OFFSET, LAYOUT, \ - DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##spmv_tensor_core##_##ASCALAR##_##XSCALAR##_##YSCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ - /* easy case with different alphas and betas*/ \ - test_spmv_bsrmatrix(16, 16, 0, 0); \ - test_spmv_bsrmatrix(16, 16, 1, 0); \ - test_spmv_bsrmatrix(16, 16, 0, 1); \ - test_spmv_bsrmatrix(16, 16, 1, 1); \ - /* easy case with a real alpha/beta */ \ - test_spmv_bsrmatrix(16, 16, 1.25, -2.73); \ - /* smaller block size with k < and > block size*/ \ - test_spmv_bsrmatrix(7, 6, 1.25, -2.73); \ - test_spmv_bsrmatrix(7, 7, 1.25, -2.73); \ - test_spmv_bsrmatrix(7, 8, 1.25, -2.73); \ - /* smaller block size with k < and > block size*/ \ - test_spmv_bsrmatrix(15, 14, 1.25, -2.73); \ - test_spmv_bsrmatrix(15, 15, 1.25, -2.73); \ - test_spmv_bsrmatrix(15, 16, 1.25, -2.73); \ - /* larger block size with k < and > block size*/ \ - test_spmv_bsrmatrix(17, 16, 1.25, -2.73); \ - test_spmv_bsrmatrix(17, 17, 1.25, -2.73); \ - test_spmv_bsrmatrix(17, 18, 1.25, -2.73); \ - /* larger block size with k < and > block size*/ \ - test_spmv_bsrmatrix(32, 31, 1.25, -2.73); \ - test_spmv_bsrmatrix(32, 32, 1.25, -2.73); \ - test_spmv_bsrmatrix(32, 33, 1.25, -2.73); \ - /* more than one team per block*/ \ - test_spmv_bsrmatrix(33, 13, 1.25, -2.73); \ - test_spmv_bsrmatrix(33, 27, 1.25, -2.73); \ - test_spmv_bsrmatrix(33, 41, 1.25, -2.73); \ - } - -// minimal conditions for tensor core SpMV test -// BsrMatrix spmv is only supported on CUDA for the time being 
-#if defined(KOKKOS_ENABLE_CUDA) && defined(TEST_CUDA_SPARSE_CPP) && \ - (defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_AMPERE)) - -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -// EXECUTE_TEST_TC(kokkos_half, kokkos_half, float, int, size_t, LayoutLeft, -// TestExecSpace) EXECUTE_TEST_TC(kokkos_half, float, float, int, -// size_t, LayoutLeft, TestExecSpace) EXECUTE_TEST_TC(float, kokkos_half, -// float, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_TC(float, float, float, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -// EXECUTE_TEST_TC(kokkos_half, kokkos_half, double, int, size_t, LayoutLeft, -// TestExecSpace) EXECUTE_TEST_TC(kokkos_half, double, double, int, -// size_t, LayoutLeft, TestExecSpace) EXECUTE_TEST_TC(double, kokkos_half, -// double, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -// EXECUTE_TEST_TC(kokkos_half, kokkos_half, float, int, size_t, LayoutRight, -// TestExecSpace) EXECUTE_TEST_TC(kokkos_half, float, float, int, -// size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_TC(float, kokkos_half, -// float, int, size_t, LayoutRight, TestExecSpace) -EXECUTE_TEST_TC(float, float, float, 
int, size_t, LayoutRight, TestExecSpace) -#endif - -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -// EXECUTE_TEST_TC(kokkos_half, kokkos_half, double, int, size_t, -// LayoutRight, TestExecSpace) EXECUTE_TEST_TC(kokkos_half, double, double, -// int, size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_TC(double, -// kokkos_half, double, int, size_t, LayoutRight, TestExecSpace) -EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutRight, TestExecSpace) -#endif - -#endif // tensor core SpMV tests - -#undef EXECUTE_TEST_TC - #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_ISSUE_101(TestExecSpace) +EXECUTE_TEST_ISSUE_101(TestDevice) #endif #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ - EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestExecSpace) + EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestDevice) \ + EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestDevice) #include @@ -1799,9 +1365,10 @@ EXECUTE_TEST_ISSUE_101(TestExecSpace) (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ - EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) \ + EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) \ + EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) #include @@ -1814,7 +1381,8 @@ EXECUTE_TEST_ISSUE_101(TestExecSpace) 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestDevice) \ + EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LayoutRight, TestDevice) #include diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index b2883c1e91..5b823a22f7 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -16,7 +16,7 @@ /*! \file Test_Sparse_spmv_bsr.hpp - Test the following 768 combos for at least a few matcies. + Test the following 256 combos for at least a few matcies. Algorithms Alpha Beta Block Sizes Modes (none) 0 0 1 N @@ -25,6 +25,8 @@ 3.7 -1.5 9 H There are also a subset of tests on larger matrices + + Multivector products are also tested for these cases with 1 and 7 vectors */ #include @@ -51,6 +53,29 @@ using kokkos_complex_double = Kokkos::complex; using kokkos_complex_float = Kokkos::complex; +/* Poor-man's std::optional since CUDA 11.0 seems to have an ICE + https://github.com/kokkos/kokkos-kernels/issues/1943 +*/ +struct OptCtrls { + bool present_; + KokkosKernels::Experimental::Controls ctrls_; + + OptCtrls() : present_(false) {} + OptCtrls(const KokkosKernels::Experimental::Controls &ctrls) + : present_(true), ctrls_(ctrls) {} + + operator bool() const { return present_; } + + constexpr const KokkosKernels::Experimental::Controls &operator*() + const &noexcept { + return ctrls_; + } + constexpr const KokkosKernels::Experimental::Controls *operator->() const + noexcept { + return &ctrls_; + } +}; + namespace Test_Spmv_Bsr { /*! \brief Maximum value used to fill A */ @@ -82,6 +107,23 @@ inline bool mode_is_transpose(const char *mode) { return mode[0] == 'T' || mode[0] == 'H'; } +/*! 
\brief Get the max nonzeros (not max nonzero _blocks_) per row of Op(A) */ +template +inline size_t opMaxNnzPerRow(const Bsr &A, bool trans) { + if (trans) { + auto At = KokkosSparse::Impl::transpose_bsr_matrix(A); + return At.blockDim() * + (size_t)KokkosSparse::Impl::graph_max_degree< + typename Bsr::execution_space, typename Bsr::ordinal_type>( + At.graph.row_map); + } else { + return A.blockDim() * + (size_t)KokkosSparse::Impl::graph_max_degree< + typename Bsr::execution_space, typename Bsr::ordinal_type>( + A.graph.row_map); + } +} + /*! \brief 0x0 matrix */ template Bsr bsr_corner_case_0_by_0(const int blockSize) { @@ -126,49 +168,31 @@ Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { return KokkosSparse::Impl::expand_crs_to_bsr(crs, blockSize); } -/*! \brief reference SpMV is the KokkosSparse::spmv on the equivalent point - * matrix - */ -template -void reference_spmv(const char *mode, const Alpha &alpha, const Bsr &a, - const XVector &x, const Beta &beta, const YVector &y) { - using Crs = KokkosSparse::CrsMatrix< - typename Bsr::non_const_value_type, typename Bsr::non_const_ordinal_type, - typename Bsr::device_type, void, typename Bsr::non_const_size_type>; - const Crs crs = KokkosSparse::Impl::bsr_to_crs(a); - - KokkosSparse::spmv(mode, alpha, crs, x, beta, y); -} - /*! 
\brief test a specific spmv */ -template -void test_spmv(const char *alg, const char *mode, const Alpha &alpha, - const Beta &beta, const Bsr &a, const XVector &x, - const YVector &y) { - using execution_space = typename Bsr::execution_space; - using scalar_type = typename Bsr::non_const_value_type; - using ordinal_type = typename Bsr::non_const_ordinal_type; - using KATS = Kokkos::ArithTraits; - using mag_type = typename KATS::mag_type; +void test_spmv(const OptCtrls &controls, const char *mode, const Alpha &alpha, + const Beta &beta, const Bsr &a, const Crs &acrs, + size_t maxNnzPerRow, const XVector &x, const YVector &y) { + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; - // generate expected result from reference implementation + // generate expected result from reference (CRS) implementation YVector yExp("yExp", y.extent(0)); Kokkos::deep_copy(yExp, y); - reference_spmv(mode, alpha, a, x, beta, yExp); + KokkosSparse::spmv(mode, alpha, acrs, x, beta, yExp); // scratch space for actual value (don't modify input) YVector yAct("yAct", y.extent(0)); Kokkos::deep_copy(yAct, y); - if (alg) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", alg); - KokkosSparse::spmv(controls, mode, alpha, a, x, beta, yAct); + if (controls) { + KokkosSparse::spmv(*controls, mode, alpha, a, x, beta, yAct); } else { KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); } @@ -179,23 +203,6 @@ void test_spmv(const char *alg, const char *mode, const Alpha &alpha, Kokkos::deep_copy(hyExp, yExp); Kokkos::deep_copy(hyAct, yAct); - // max nnz per row is used for the tolerance - // for a transposed computation, need to transpose the matrix before - // seeing which rows are longest - size_t maxNnzPerRow; - if (mode_is_transpose(mode)) { - auto at = KokkosSparse::Impl::transpose_bsr_matrix(a); - maxNnzPerRow = - 
at.blockDim() * - KokkosSparse::Impl::graph_max_degree( - at.graph.row_map); - } else { - maxNnzPerRow = - a.blockDim() * - KokkosSparse::Impl::graph_max_degree( - a.graph.row_map); - } - /* assume that any floating-point op may introduce eps() error scaling y is one op dot product of x is two ops per entry (mul and add) @@ -216,9 +223,16 @@ void test_spmv(const char *alg, const char *mode, const Alpha &alpha, } if (!errIdx.empty()) { + std::string alg; + if (controls) { + alg = controls->getParameter("algorithm", ""); + } else { + alg = ""; + } + std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMV failure!" << std::endl; - std::cerr << "alg: " << (alg ? alg : "") << std::endl; + std::cerr << "alg: " << alg << std::endl; std::cerr << "mode: " << mode << std::endl; std::cerr << "A: " << a.numRows() << "x" << a.numCols() << std::endl; @@ -367,18 +381,41 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a) { /*! \brief test all combos of the provided matrix */ -template -void test_spmv_combos(const char *mode, const Bsr &a) { - using scalar_type = typename Bsr::non_const_value_type; +template +void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, + size_t maxNnzPerRow) { + using scalar_type = typename Bsr::non_const_value_type; + using execution_space = typename Bsr::execution_space; auto [x, y] = random_vecs_for_spmv(mode, a); - for (auto alg : {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + // cover a variety of controls + using Ctrls = KokkosKernels::Experimental::Controls; + std::vector ctrls = {OptCtrls(), // no controls + OptCtrls(Ctrls()), // empty controls + OptCtrls(Ctrls({{"algorithm", "tpl"}})), + OptCtrls(Ctrls({{"algorithm", "v4.1"}}))}; + + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { +#if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) + ctrls.push_back(OptCtrls(Ctrls({{"algorithm", "experimental_tc"}}))); +#if 
defined(KOKKOS_ARCH_AMPERE) + ctrls.push_back(OptCtrls(Ctrls( + {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}}))); +#endif // AMPERE +#endif // AMPERE || VOLTA + } +#endif // CUDA + } + + for (const auto &ctrl : ctrls) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spmv(alg, mode, alpha, beta, a, x, y); + test_spmv(ctrl, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); } } } @@ -390,11 +427,24 @@ template void test_spmv_corner_cases() { using Bsr = KokkosSparse::Experimental::BsrMatrix; + using Crs = KokkosSparse::CrsMatrix; for (auto mode : {"N", "T", "C", "H"}) { for (int bs : {1, 2, 5, 9}) { - test_spmv_combos(mode, bsr_corner_case_0_by_0(bs)); - test_spmv_combos(mode, bsr_corner_case_0_by_1(bs)); - test_spmv_combos(mode, bsr_corner_case_1_by_0(bs)); + { + auto A = bsr_corner_case_0_by_0(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spmv_combos(mode, A, Acrs, 0); + } + { + auto A = bsr_corner_case_0_by_1(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spmv_combos(mode, A, Acrs, 0); + } + { + auto A = bsr_corner_case_1_by_0(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spmv_combos(mode, A, Acrs, 0); + } } } } @@ -403,21 +453,37 @@ template void test_spmv_random() { using Bsr = KokkosSparse::Experimental::BsrMatrix; - for (auto mode : {"N", "T", "C", "H"}) { + using Crs = KokkosSparse::CrsMatrix; + // thoroughly test smaller matrices + std::vector> shapes = {{10, 10}, {10, 50}, {50, 10}}; + for (auto &shape : shapes) { for (int bs : {1, 2, 5, 9}) { - test_spmv_combos(mode, bsr_random(bs, 10, 10)); - test_spmv_combos(mode, bsr_random(bs, 10, 50)); - test_spmv_combos(mode, bsr_random(bs, 50, 10)); + auto A = bsr_random(bs, shape.first, shape.second); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + size_t maxNnzPerRow = opMaxNnzPerRow(A, false); + size_t 
maxNnzPerRowTrans = opMaxNnzPerRow(A, true); + for (auto mode : {"N", "T", "C", "H"}) { + test_spmv_combos( + mode, A, Acrs, + mode_is_transpose(mode) ? maxNnzPerRowTrans : maxNnzPerRow); + } } } // test a tougher case on a big matrix - constexpr int blockSizePrime = 7; - constexpr int smallPrime = 11; - constexpr int largePrime = 499; - for (auto mode : {"N", "T"}) { - test_spmv_combos(mode, - bsr_random(blockSizePrime, smallPrime, largePrime)); + { + constexpr int blockSizePrime = 7; + constexpr int smallPrime = 11; + constexpr int largePrime = 499; + auto A = bsr_random(blockSizePrime, smallPrime, largePrime); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + size_t maxNnzPerRow = opMaxNnzPerRow(A, false); + size_t maxNnzPerRowTrans = opMaxNnzPerRow(A, true); + for (auto mode : {"N", "T"}) { + test_spmv_combos( + mode, A, Acrs, + mode_is_transpose(mode) ? maxNnzPerRowTrans : maxNnzPerRow); + } } } @@ -431,30 +497,29 @@ void test_spmv() { // Multivector // ---------------------------------------------------------------------------- -template -void test_spm_mv(const char *alg, const char *mode, const Alpha &alpha, - const Beta &beta, const Bsr &a, const XVector &x, - const YVector &y) { - using execution_space = typename Bsr::execution_space; - using scalar_type = typename Bsr::non_const_value_type; - using ordinal_type = typename Bsr::non_const_ordinal_type; - using KATS = Kokkos::ArithTraits; - using mag_type = typename KATS::mag_type; +// Note: if mode_is_transpose(mode), then maxNnzPerRow is for A^T. Otherwise, +// it's for A. 
+template +void test_spm_mv(const OptCtrls &controls, const char *mode, const Alpha &alpha, + const Beta &beta, const Bsr &a, const Crs &acrs, + size_t maxNnzPerRow, const XVector &x, const YVector &y) { + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; - // generate expected result from reference implementation + // generate expected result from reference (CRS) implementation YVector yExp("yExp", y.extent(0), y.extent(1)); Kokkos::deep_copy(yExp, y); - reference_spmv(mode, alpha, a, x, beta, yExp); + KokkosSparse::spmv(mode, alpha, acrs, x, beta, yExp); // scratch space for actual value (don't modify input) YVector yAct("yAct", y.extent(0), y.extent(1)); Kokkos::deep_copy(yAct, y); - if (alg) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", alg); - KokkosSparse::spmv(controls, mode, alpha, a, x, beta, yAct); + if (controls) { + KokkosSparse::spmv(*controls, mode, alpha, a, x, beta, yAct); } else { KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); } @@ -465,23 +530,6 @@ void test_spm_mv(const char *alg, const char *mode, const Alpha &alpha, Kokkos::deep_copy(hyExp, yExp); Kokkos::deep_copy(hyAct, yAct); - // max nnz per row is used for the tolerance - // for a transposed computation, need to transpose the matrix before - // seeing which rows are longest - size_t maxNnzPerRow; - if (mode_is_transpose(mode)) { - auto at = KokkosSparse::Impl::transpose_bsr_matrix(a); - maxNnzPerRow = - at.blockDim() * - KokkosSparse::Impl::graph_max_degree( - at.graph.row_map); - } else { - maxNnzPerRow = - a.blockDim() * - KokkosSparse::Impl::graph_max_degree( - a.graph.row_map); - } - /* assume that any floating-point op may introduce eps() error scaling y is one op dot product of x is two ops per entry (mul and add) @@ -502,9 +550,16 @@ void test_spm_mv(const char *alg, const char *mode, const 
Alpha &alpha, } if (!errIdx.empty()) { + std::string alg; + if (controls) { + alg = controls->getParameter("algorithm", ""); + } else { + alg = ""; + } + std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMMV failure!" << std::endl; - std::cerr << "alg: " << (alg ? alg : "") << std::endl; + std::cerr << "alg: " << alg << std::endl; std::cerr << "mode: " << mode << std::endl; std::cerr << "A: " << a.numRows() << "x" << a.numCols() << std::endl; @@ -563,19 +618,41 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, return std::make_tuple(x, y); } -template -void test_spm_mv_combos(const char *mode, const Bsr &a) { - using scalar_type = typename Bsr::non_const_value_type; +template +void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, + size_t maxNnzPerRow) { + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + + // cover a variety of controls + using Ctrls = KokkosKernels::Experimental::Controls; + std::vector ctrls = {OptCtrls(), // no controls + OptCtrls(Ctrls()), // empty controls + OptCtrls(Ctrls({{"algorithm", "tpl"}})), + OptCtrls(Ctrls({{"algorithm", "v4.1"}}))}; + + if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { +#if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) + ctrls.push_back(OptCtrls(Ctrls({{"algorithm", "experimental_tc"}}))); +#if defined(KOKKOS_ARCH_AMPERE) + ctrls.push_back(OptCtrls(Ctrls( + {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}}))); +#endif // AMPERE +#endif // AMPERE || VOLTA + } +#endif // CUDA + } - for (size_t numVecs : {1, 2, 7}) { // num multivecs + for (size_t numVecs : {1, 7}) { // num multivecs auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); - for (auto alg : - {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (const auto &ctrl : ctrls) { for (scalar_type alpha : {scalar_type(0), 
scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spm_mv(alg, mode, alpha, beta, a, x, y); + test_spm_mv(ctrl, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); } } } @@ -589,11 +666,24 @@ template ; + using Crs = KokkosSparse::CrsMatrix; for (auto mode : {"N", "T", "C", "H"}) { for (int bs : {1, 2, 5, 9}) { - test_spm_mv_combos(mode, bsr_corner_case_0_by_0(bs)); - test_spm_mv_combos(mode, bsr_corner_case_0_by_1(bs)); - test_spm_mv_combos(mode, bsr_corner_case_1_by_0(bs)); + { + auto A = bsr_corner_case_0_by_0(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spm_mv_combos(mode, A, Acrs, 0); + } + { + auto A = bsr_corner_case_0_by_1(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spm_mv_combos(mode, A, Acrs, 0); + } + { + auto A = bsr_corner_case_1_by_0(bs); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + test_spm_mv_combos(mode, A, Acrs, 0); + } } } } @@ -603,22 +693,37 @@ template ; + using Crs = KokkosSparse::CrsMatrix; // thoroughly test smaller matrices - for (auto mode : {"N", "T", "C", "H"}) { + std::vector> shapes = {{10, 10}, {10, 50}, {50, 10}}; + for (auto &shape : shapes) { for (int bs : {1, 2, 5, 9}) { - test_spm_mv_combos(mode, bsr_random(bs, 10, 10)); - test_spm_mv_combos(mode, bsr_random(bs, 10, 50)); - test_spm_mv_combos(mode, bsr_random(bs, 50, 10)); + auto A = bsr_random(bs, shape.first, shape.second); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + size_t maxNnzPerRow = opMaxNnzPerRow(A, false); + size_t maxNnzPerRowTrans = opMaxNnzPerRow(A, true); + for (auto mode : {"N", "T", "C", "H"}) { + test_spm_mv_combos( + mode, A, Acrs, + mode_is_transpose(mode) ? 
maxNnzPerRowTrans : maxNnzPerRow); + } } } // test a tougher case on a big matrix - constexpr int blockSizePrime = 7; - constexpr int smallPrime = 11; - constexpr int largePrime = 499; - for (auto mode : {"N", "T"}) { - test_spm_mv_combos( - mode, bsr_random(blockSizePrime, smallPrime, largePrime)); + { + constexpr int blockSizePrime = 7; + constexpr int smallPrime = 11; + constexpr int largePrime = 499; + auto A = bsr_random(blockSizePrime, smallPrime, largePrime); + auto Acrs = KokkosSparse::Impl::bsr_to_crs(A); + size_t maxNnzPerRow = opMaxNnzPerRow(A, false); + size_t maxNnzPerRowTrans = opMaxNnzPerRow(A, true); + for (auto mode : {"N", "T"}) { + test_spm_mv_combos( + mode, A, Acrs, + mode_is_transpose(mode) ? maxNnzPerRowTrans : maxNnzPerRow); + } } } @@ -656,8 +761,7 @@ void test_spm_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, \ - TestExecSpace) + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) #include @@ -668,8 +772,7 @@ void test_spm_mv() { #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, \ - TestExecSpace) + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, TestDevice) #include diff --git a/sparse/unit_test/Test_Sparse_trsv.hpp b/sparse/unit_test/Test_Sparse_trsv.hpp index e6bc13d6a0..d580cc472d 100644 --- a/sparse/unit_test/Test_Sparse_trsv.hpp +++ b/sparse/unit_test/Test_Sparse_trsv.hpp @@ -139,7 +139,7 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestDevice) #include @@ -152,7 
+152,7 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestDevice) #include diff --git a/sparse/unit_test/backends/Test_Cuda_BlockSparse.cpp b/sparse/unit_test/backends/Test_Cuda_BlockSparse.cpp new file mode 100644 index 0000000000..d5c73f48d0 --- /dev/null +++ b/sparse/unit_test/backends/Test_Cuda_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_CUDA_BLOCKSPARSE_CPP +#define TEST_CUDA_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_CUDA_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_HIP_BlockSparse.cpp b/sparse/unit_test/backends/Test_HIP_BlockSparse.cpp new file mode 100644 index 0000000000..f040cbf2de --- /dev/null +++ b/sparse/unit_test/backends/Test_HIP_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_HIP_BLOCKSPARSE_CPP +#define TEST_HIP_BLOCKSPARSE_CPP + +#include "Test_HIP.hpp" +#include "Test_BlockSparse.hpp" + +#endif // TEST_HIP_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_OpenMPTarget_BlockSparse.cpp b/sparse/unit_test/backends/Test_OpenMPTarget_BlockSparse.cpp new file mode 100644 index 0000000000..7ea1bcf3f7 --- /dev/null +++ b/sparse/unit_test/backends/Test_OpenMPTarget_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMPTARGET_BLOCKSPARSE_CPP +#define TEST_OPENMPTARGET_BLOCKSPARSE_CPP + +#include "Test_OpenMPTarget.hpp" +#include "Test_BlockSparse.hpp" + +#endif // TEST_OPENMPTARGET_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_OpenMP_BlockSparse.cpp b/sparse/unit_test/backends/Test_OpenMP_BlockSparse.cpp new file mode 100644 index 0000000000..739ccf0a59 --- /dev/null +++ b/sparse/unit_test/backends/Test_OpenMP_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMP_BLOCKSPARSE_CPP +#define TEST_OPENMP_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_OPENMP_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_SYCL_BlockSparse.cpp b/sparse/unit_test/backends/Test_SYCL_BlockSparse.cpp new file mode 100644 index 0000000000..3f80795f9f --- /dev/null +++ b/sparse/unit_test/backends/Test_SYCL_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SYCL_BLOCKSPARSE_CPP +#define TEST_SYCL_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_SYCL_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_Serial_BlockSparse.cpp b/sparse/unit_test/backends/Test_Serial_BlockSparse.cpp new file mode 100644 index 0000000000..69194c0669 --- /dev/null +++ b/sparse/unit_test/backends/Test_Serial_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SERIAL_BLOCKSPARSE_CPP +#define TEST_SERIAL_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_SERIAL_BLOCKSPARSE_CPP diff --git a/sparse/unit_test/backends/Test_Threads_BlockSparse.cpp b/sparse/unit_test/backends/Test_Threads_BlockSparse.cpp new file mode 100644 index 0000000000..8ec1c442f6 --- /dev/null +++ b/sparse/unit_test/backends/Test_Threads_BlockSparse.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_THREADS_BLOCKSPARSE_CPP +#define TEST_THREADS_BLOCKSPARSE_CPP + +#include +#include + +#endif // TEST_THREADS_BLOCKSPARSE_CPP diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 95a3459699..236bcdd1c8 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -411,6 +411,7 @@ class epsilon { constexpr static double value = std::numeric_limits::epsilon(); }; +#if KOKKOS_VERSION < 40199 // explicit epsilon specializations #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT template <> @@ -428,6 +429,7 @@ class epsilon { constexpr static double value = 0.0078125F; }; #endif // KOKKOS_HALF_T_IS_FLOAT +#endif // KOKKOS_VERSION < 40199 using KokkosKernels::Impl::getRandomBounds; @@ -574,13 +576,14 @@ int string_compare_no_case(const std::string& str1, const std::string& str2) { /// /brief Coo matrix class for testing purposes. 
/// \tparam ScalarType /// \tparam LayoutType -/// \tparam ExeSpaceType -template +/// \tparam Device +template class RandCooMat { private: - using RowViewTypeD = Kokkos::View; - using ColViewTypeD = Kokkos::View; - using DataViewTypeD = Kokkos::View; + using ExeSpaceType = typename Device::execution_space; + using RowViewTypeD = Kokkos::View; + using ColViewTypeD = Kokkos::View; + using DataViewTypeD = Kokkos::View; RowViewTypeD __row_d; ColViewTypeD __col_d; DataViewTypeD __data_d; diff --git a/test_common/Test_Common_Test_All_Type_Combos.hpp b/test_common/Test_Common_Test_All_Type_Combos.hpp index c51601fdf4..a51d796632 100644 --- a/test_common/Test_Common_Test_All_Type_Combos.hpp +++ b/test_common/Test_Common_Test_All_Type_Combos.hpp @@ -31,26 +31,25 @@ // ETI is off, test all possible type combos -KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestDevice) #if !defined(NO_TEST_COMPLEX) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) 
-KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestDevice) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestDevice) #endif @@ -61,49 +60,49 @@ KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestDevice) #endif #if 
(defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestDevice) #endif #if !defined(NO_TEST_COMPLEX) @@ -111,50 +110,49 @@ KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) 
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestDevice) #endif #endif // !NO_TEST_COMPLEX diff --git 
a/test_common/Test_Cuda.hpp b/test_common/Test_Cuda.hpp index 0bfe35718b..cf1042a2c4 100644 --- a/test_common/Test_Cuda.hpp +++ b/test_common/Test_Cuda.hpp @@ -32,6 +32,27 @@ class Cuda : public ::testing::Test { }; #define TestCategory Cuda -#define TestExecSpace Kokkos::Cuda + +using CudaSpaceDevice = Kokkos::Device; +using CudaUVMSpaceDevice = Kokkos::Device; + +#ifdef KOKKOS_ENABLE_CUDA_UVM +// KOKKOS_ENABLE_CUDA_UVM macro and cmake option is deprecated +// But if it is defined, test with CudaUVMSpace. +// Make sure it's instantiated first: +#if defined(KOKKOSKERNELS_TEST_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +#error \ + "Deprecated option KOKKOS_ENABLE_CUDA_UVM is defined, so KokkosKernels will test with CudaUVMSpace. " \ + "KokkosKernels_INST_MEMSPACE_CUDAUVMSPACE=ON must be set in configuration." +#endif +#define TestDevice CudaUVMSpaceDevice +// Prefer for any testing where only one exec space is used +#elif defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) && \ + !defined(KOKKOSKERNELS_INST_MEMSPACE_CUDASPACE) +#define TestDevice CudaUVMSpaceDevice +#else +#define TestDevice CudaSpaceDevice +#endif #endif // TEST_CUDA_HPP diff --git a/test_common/Test_HIP.hpp b/test_common/Test_HIP.hpp index 7e61bfc9c3..c9e02698c5 100644 --- a/test_common/Test_HIP.hpp +++ b/test_common/Test_HIP.hpp @@ -32,6 +32,6 @@ class hip : public ::testing::Test { }; #define TestCategory hip -#define TestExecSpace Kokkos::Experimental::HIP +#define TestDevice Kokkos::HIP #endif // TEST_HIP_HPP diff --git a/test_common/Test_OpenMP.hpp b/test_common/Test_OpenMP.hpp index 8b4f90730e..3d110e4479 100644 --- a/test_common/Test_OpenMP.hpp +++ b/test_common/Test_OpenMP.hpp @@ -32,6 +32,6 @@ class openmp : public ::testing::Test { }; #define TestCategory openmp -#define TestExecSpace Kokkos::OpenMP +#define TestDevice Kokkos::OpenMP #endif // TEST_OPENMP_HPP diff --git a/test_common/Test_OpenMPTarget.hpp b/test_common/Test_OpenMPTarget.hpp index 
2056d8be01..d41f95dad1 100644 --- a/test_common/Test_OpenMPTarget.hpp +++ b/test_common/Test_OpenMPTarget.hpp @@ -32,6 +32,6 @@ class openmptarget : public ::testing::Test { }; #define TestCategory openmptarget -#define TestExecSpace Kokkos::Experimental::OpenMPTarget +#define TestDevice Kokkos::Experimental::OpenMPTarget #endif // TEST_OPENMPTARGET_HPP diff --git a/test_common/Test_SYCL.hpp b/test_common/Test_SYCL.hpp index c7022f35d1..493b8082a4 100644 --- a/test_common/Test_SYCL.hpp +++ b/test_common/Test_SYCL.hpp @@ -29,4 +29,4 @@ class sycl_test : public ::testing::Test { }; #define TestCategory sycl_test -#define TestExecSpace Kokkos::Experimental::SYCL +#define TestDevice Kokkos::Experimental::SYCL diff --git a/test_common/Test_Serial.hpp b/test_common/Test_Serial.hpp index fe2917937b..aca218cade 100644 --- a/test_common/Test_Serial.hpp +++ b/test_common/Test_Serial.hpp @@ -32,6 +32,6 @@ class serial : public ::testing::Test { }; #define TestCategory serial -#define TestExecSpace Kokkos::Serial +#define TestDevice Kokkos::Serial #endif // TEST_SERIAL_HPP diff --git a/test_common/Test_Threads.hpp b/test_common/Test_Threads.hpp index 1e2919b68f..d527023c8f 100644 --- a/test_common/Test_Threads.hpp +++ b/test_common/Test_Threads.hpp @@ -32,6 +32,6 @@ class threads : public ::testing::Test { }; #define TestCategory threads -#define TestExecSpace Kokkos::Threads +#define TestDevice Kokkos::Threads #endif // TEST_THREADS_HPP