Merge pull request #1617 from kokkos/release-candidate-3.7.01

Release 3.7.01
kokkos · Dec 6, 2022 · ddf1b3d · ddf1b3d
2 parents 04821ac + 912b0d1
commit ddf1b3d
Show file tree

Hide file tree

Showing 10 changed files with 315 additions and 37 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,22 @@
 # Change Log
 
+## [3.7.01](https://github.com/kokkos/kokkos-kernels/tree/3.7.01) (2022-12-01)
+[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.00...3.7.01)
+
+### Bug Fixes:
+
+- Change template type for StaticCrsGraph in BsrMatrix [\#1531](https://github.com/kokkos/kokkos-kernels/pull/1531)
+- Remove listing of undefined TPL deps [\#1568](https://github.com/kokkos/kokkos-kernels/pull/1568)
+- Fix using SpGEMM with nonstandard scalar type, with MKL enabled [\#1591](https://github.com/kokkos/kokkos-kernels/pull/1591)
+- Move destroying dense vector descriptors out of cuSparse sptrsv handle [\#1590](https://github.com/kokkos/kokkos-kernels/pull/1590)
+- Fix `cuda_data_type_from` to return `CUDA_C_64F` for `Kokkos::complex<double>` [\#1604](https://github.com/kokkos/kokkos-kernels/pull/1604)
+- Disable compile-time check in cuda_data_type_from on supported scalar types for cuSPARSE [\#1605](https://github.com/kokkos/kokkos-kernels/pull/1605)
+- Reduce register pressure in batched dense algorithms [\#1588](https://github.com/kokkos/kokkos-kernels/pull/1588)
+
+### Implemented enhancements:
+
+- Use new cusparseSpSV TPL for SPTRSV when cuSPARSE is enabled with CUDA >= 11.3 [\#1574](https://github.com/kokkos/kokkos-kernels/pull/1574)
+
 ## [3.7.00](https://github.com/kokkos/kokkos-kernels/tree/3.7.00) (2022-08-18)
 [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00)
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS)
   ENDIF()
   SET(KokkosKernels_VERSION_MAJOR 3)
   SET(KokkosKernels_VERSION_MINOR 7)
-  SET(KokkosKernels_VERSION_PATCH 00)
+  SET(KokkosKernels_VERSION_PATCH 01)
   SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}")
   MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}")
 ENDIF()

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
@@ -1,6 +1,6 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
         LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms
-        LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE CUBLAS
+        LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS
         TEST_OPTIONAL_TPLS yaml-cpp
 )
 # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in

diff --git a/master_history.txt b/master_history.txt
@@ -18,3 +18,4 @@ tag: 3.5.00     date: 11/19/2021  master: 00189c0b    release: f171533d
 tag: 3.6.00     date: 04/06/2022  master: 8381db04    release: a7e683c4
 tag: 3.6.01     date: 05/23/2022  master: e09389ae    release: e1d8de42
 tag: 3.7.00     date: 08/25/2022  master: 42ab7a29    release: 9cc88ffa
+tag: 3.7.01     date: 12/01/2022  master: 04821ac3    release: 6cb632b6
diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp
@@ -259,6 +259,42 @@ template <class ArgTransA, class ArgTransB, class ArgBatchSzDim,
           int tile_m, int tile_n, int tile_k>
 class BatchedDblBufGemm;
 
+//////////////////////////////// tile_m //////////////////////////////////
+template <typename ExecutionSpace>
+constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() {
+  return 32;
+}
+//////////////////////////////// tile_n //////////////////////////////////
+template <typename ExecutionSpace>
+constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_n() {
+  return 32;
+}
+//////////////////////////////// tile_k //////////////////////////////////
+template <typename ExecutionSpace>
+constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_k() {
+  return 8;
+}
+
+// On MI100, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails
+// without this. See https://github.com/kokkos/kokkos-kernels/issues/1547.
+// This reduces the register allocations (REG_M and REG_N) in the double
+// buffering algorithm by a factor of 2.
+#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908)
+template <>
+constexpr KOKKOS_INLINE_FUNCTION int
+kk_gemm_dlb_buf_tile_k<Kokkos::Experimental::HIP>() {
+  return 16;
+}
+#endif
+////////////////////////// alpha_in_fma_thresh ////////////////////////////
+constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() {
+#ifdef __CUDACC_RDC__
+  return 24;
+#else
+  return 64;
+#endif  // __CUDACC_RDC__
+}
+
 // clang-format off
 /// \brief Blocking solve of general matrix multiply on a batch of uniform matrices.
 ///
@@ -458,19 +494,19 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha,
   // Begin checking conditions for optimal BatchedGemm invocation.
   using view_scalar_type   = typename CViewType::value_type;
   using layout_type        = typename CViewType::array_layout;
+  using exec_space         = typename CViewType::execution_space;
   constexpr bool is_vector = KokkosBatched::is_vector<view_scalar_type>::value;
-  constexpr bool on_gpu    = KokkosKernels::Impl::kk_is_gpu_exec_space<
-      typename CViewType::execution_space>();
+  constexpr bool on_gpu =
+      KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>();
   constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space<
-      typename CViewType::execution_space::memory_space>();
+      typename exec_space::memory_space>();
   constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space<
-      typename CViewType::execution_space::memory_space>();
+      typename exec_space::memory_space>();
 
   if (handle->enableDebug) {
     std::cout << "view_scalar_type:" << typeid(view_scalar_type).name()
               << std::endl
-              << "execution_space:"
-              << typeid(typename CViewType::execution_space).name() << std::endl
+              << "execution_space:" << typeid(exec_space).name() << std::endl
               << std::endl
               << "is_vector:" << is_vector << std::endl
               << "on_gpu:" << on_gpu << std::endl
@@ -521,12 +557,11 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha,
                          ? (c_m >= 16)
                          : (c_m >= 24 && c_m <= 32) || c_m >= 40)) {
         handle->teamSz = handle->vecLen = 8;
-        constexpr int tile_m = 32, tile_n = 32, tile_k = 8;
-#ifdef __CUDACC_RDC__
-        constexpr size_t alpha_in_fma_thresh = 24;
-#else
-        constexpr size_t alpha_in_fma_thresh = 64;
-#endif  // __CUDAACC_RDC__
+        constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m<exec_space>();
+        constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n<exec_space>();
+        constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k<exec_space>();
+        constexpr size_t alpha_in_fma_thresh =
+            Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh();
 
         if (c_m % 32 == 0) {                 // No bounds checking
           if (c_m >= alpha_in_fma_thresh) {  // apply alpha in fma

diff --git a/src/sparse/KokkosSparse_BsrMatrix.hpp b/src/sparse/KokkosSparse_BsrMatrix.hpp
@@ -390,12 +390,12 @@ class BsrMatrix {
   typedef BsrMatrix<ScalarType, OrdinalType, host_mirror_space, MemoryTraits>
       HostMirror;
   //! Type of the graph structure of the sparse matrix.
-  typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft,
-                                 execution_space, memory_traits, size_type>
+  typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft, device_type,
+                                 memory_traits, size_type>
       StaticCrsGraphType;
   //! Type of the graph structure of the sparse matrix - consistent with Kokkos.
-  typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft,
-                                 execution_space, memory_traits, size_type>
+  typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft, device_type,
+                                 memory_traits, size_type>
       staticcrsgraph_type;
   //! Type of column indices in the sparse matrix.
   typedef typename staticcrsgraph_type::entries_type index_type;

diff --git a/src/sparse/KokkosSparse_Utils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp
@@ -116,9 +116,12 @@ inline void cusparse_internal_safe_call(cusparseStatus_t cusparseStatus,
 
 template <typename T>
 cudaDataType cuda_data_type_from() {
+  // Note:  compile-time failure is disabled to allow for packages such as
+  // Ifpack2 to more easily support scalar types that cuSPARSE may not.
+
   // compile-time failure with a nice message if called on an unsupported type
-  static_assert(!std::is_same<T, T>::value,
-                "cuSparse TPL does not support scalar type");
+  // static_assert(!std::is_same<T, T>::value,
+  //               "cuSparse TPL does not support scalar type");
   // static_assert(false, ...) is allowed to error even if the code is not
   // instantiated. obfuscate the predicate Despite this function being
   // uncompilable, the compiler may decide that a return statement is missing,
@@ -151,7 +154,7 @@ inline cudaDataType cuda_data_type_from<Kokkos::complex<float>>() {
 }
 template <>
 inline cudaDataType cuda_data_type_from<Kokkos::complex<double>>() {
-  return CUDA_C_32F;
+  return CUDA_C_64F;
 }
 
 #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION)

diff --git a/src/sparse/KokkosSparse_Utils_mkl.hpp b/src/sparse/KokkosSparse_Utils_mkl.hpp
@@ -123,24 +123,28 @@ template <typename value_type>
 class MKLSparseMatrix {
   sparse_matrix_t mtx;
 
-  static_assert(mkl_is_supported_value_type<value_type>::value,
-                "Scalar type used in MKLSparseMatrix<value_type> is NOT "
-                "supported by MKL");
-
  public:
   inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {}
 
   // Constructs MKL sparse matrix from KK sparse views (m rows x n cols)
   inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols,
-                         MKL_INT *xadj, MKL_INT *adj, value_type *values);
+                         MKL_INT *xadj, MKL_INT *adj, value_type *values) {
+    throw std::runtime_error(
+        "Scalar type used in MKLSparseMatrix<value_type> is NOT "
+        "supported by MKL");
+  }
 
   // Allows using MKLSparseMatrix directly in MKL calls
   inline operator sparse_matrix_t() const { return mtx; }
 
   // Exports MKL sparse matrix contents into KK views
   inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols,
                           MKL_INT *&rows_start, MKL_INT *&columns,
-                          value_type *&values);
+                          value_type *&values) {
+    throw std::runtime_error(
+        "Scalar type used in MKLSparseMatrix<value_type> is NOT "
+        "supported by MKL");
+  }
 
   inline void destroy() {
     KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx));
@@ -256,4 +260,4 @@ inline void MKLSparseMatrix<Kokkos::complex<double>>::export_data(
 
 #endif  // KOKKOSKERNELS_ENABLE_TPL_MKL
 
-#endif  // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
+#endif  // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
diff --git a/src/sparse/KokkosSparse_sptrsv_handle.hpp b/src/sparse/KokkosSparse_sptrsv_handle.hpp
@@ -50,7 +50,7 @@
 #define KOKKOSSPARSE_SPTRSVHANDLE_HPP
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
-#include "cusparse.h"
+#include "KokkosSparse_Utils_cusparse.hpp"
 #endif
 
 #if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \
@@ -108,6 +108,8 @@ class SPTRSVHandle {
   typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t;
   typedef typename Kokkos::View<int *, HandlePersistentMemorySpace>
       int_row_view_t;
+  typedef typename Kokkos::View<int64_t *, HandlePersistentMemorySpace>
+      int64_row_view_t;
   // typedef typename row_lno_persistent_work_view_t::HostMirror
   // row_lno_persistent_work_host_view_t; //Host view type
   typedef typename Kokkos::View<
@@ -154,6 +156,42 @@ class SPTRSVHandle {
       mtx_scalar_view_t;
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if (CUDA_VERSION >= 11030)
+  struct cuSparseHandleType {
+    cusparseHandle_t handle;
+    cusparseOperation_t transpose;
+    cusparseSpMatDescr_t matDescr;
+    cusparseDnVecDescr_t vecBDescr, vecBDescr_dummy;
+    cusparseDnVecDescr_t vecXDescr, vecXDescr_dummy;
+    cusparseSpSVDescr_t spsvDescr;
+    void *pBuffer{nullptr};
+
+    cuSparseHandleType(bool transpose_, bool is_lower) {
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&handle));
+
+      KOKKOS_CUSPARSE_SAFE_CALL(
+          cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
+
+      if (transpose_) {
+        transpose = CUSPARSE_OPERATION_TRANSPOSE;
+      } else {
+        transpose = CUSPARSE_OPERATION_NON_TRANSPOSE;
+      }
+
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_createDescr(&spsvDescr));
+    }
+
+    ~cuSparseHandleType() {
+      if (pBuffer != nullptr) {
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(pBuffer));
+        pBuffer = nullptr;
+      }
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(matDescr));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_destroyDescr(spsvDescr));
+      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(handle));
+    }
+  };
+#else  // CUDA_VERSION < 11030
   struct cuSparseHandleType {
     cusparseHandle_t handle;
     cusparseOperation_t transpose;
@@ -202,6 +240,7 @@ class SPTRSVHandle {
       cusparseDestroy(handle);
     }
   };
+#endif
 
   typedef cuSparseHandleType SPTRSVcuSparseHandleType;
 #endif
@@ -337,6 +376,7 @@ class SPTRSVHandle {
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
   SPTRSVcuSparseHandleType *cuSPARSEHandle;
   int_row_view_t tmp_int_rowmap;
+  int64_row_view_t tmp_int64_rowmap;
 #endif
 
 #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
@@ -443,7 +483,8 @@ class SPTRSVHandle {
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
         ,
         cuSPARSEHandle(nullptr),
-        tmp_int_rowmap()
+        tmp_int_rowmap(),
+        tmp_int64_rowmap()
 #endif
 #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
         ,
@@ -851,6 +892,18 @@ class SPTRSVHandle {
   }
   int_row_view_t get_int_rowmap_view() { return tmp_int_rowmap; }
   int *get_int_rowmap_ptr() { return tmp_int_rowmap.data(); }
+
+  void allocate_tmp_int64_rowmap(size_type N) {
+    tmp_int64_rowmap = int64_row_view_t(
+        Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp_int64_rowmap"), N);
+  }
+  template <typename RowViewType>
+  int64_t *get_int64_rowmap_ptr_copy(const RowViewType &rowmap) {
+    Kokkos::deep_copy(tmp_int64_rowmap, rowmap);
+    Kokkos::fence();
+    return tmp_int64_rowmap.data();
+  }
+  int64_t *get_int64_rowmap_ptr() { return tmp_int64_rowmap.data(); }
 #endif
 
   bool algm_requires_symb_lvlsched() const {