Skip to content

Commit

Permalink
Merge pull request #1617 from kokkos/release-candidate-3.7.01
Browse files Browse the repository at this point in the history
Release 3.7.01
  • Loading branch information
crtrott authored Dec 6, 2022
2 parents 04821ac + 912b0d1 commit ddf1b3d
Show file tree
Hide file tree
Showing 10 changed files with 315 additions and 37 deletions.
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
# Change Log

## [3.7.01](https://github.com/kokkos/kokkos-kernels/tree/3.7.01) (2022-12-01)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.00...3.7.01)

### Bug Fixes:

- Change template type for StaticCrsGraph in BsrMatrix [\#1531](https://github.com/kokkos/kokkos-kernels/pull/1531)
- Remove listing of undefined TPL deps [\#1568](https://github.com/kokkos/kokkos-kernels/pull/1568)
- Fix using SpGEMM with nonstandard scalar type, with MKL enabled [\#1591](https://github.com/kokkos/kokkos-kernels/pull/1591)
- Move destroying dense vector descriptors out of cuSparse sptrsv handle [\#1590](https://github.com/kokkos/kokkos-kernels/pull/1590)
- Fix `cuda_data_type_from` to return `CUDA_C_64F` for `Kokkos::complex<double>` [\#1604](https://github.com/kokkos/kokkos-kernels/pull/1604)
- Disable compile-time check in cuda_data_type_from on supported scalar types for cuSPARSE [\#1605](https://github.com/kokkos/kokkos-kernels/pull/1605)
- Reduce register pressure in batched dense algorithms [\#1588](https://github.com/kokkos/kokkos-kernels/pull/1588)

### Implemented enhancements:

- Use new cusparseSpSV TPL for SPTRSV when cuSPARSE is enabled with CUDA >= 11.3 [\#1574](https://github.com/kokkos/kokkos-kernels/pull/1574)

## [3.7.00](https://github.com/kokkos/kokkos-kernels/tree/3.7.00) (2022-08-18)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00)

Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS)
ENDIF()
SET(KokkosKernels_VERSION_MAJOR 3)
SET(KokkosKernels_VERSION_MINOR 7)
SET(KokkosKernels_VERSION_PATCH 00)
SET(KokkosKernels_VERSION_PATCH 01)
SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}")
MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}")
ENDIF()
Expand Down
2 changes: 1 addition & 1 deletion cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms
LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE CUBLAS
LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS
TEST_OPTIONAL_TPLS yaml-cpp
)
# NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in
Expand Down
1 change: 1 addition & 0 deletions master_history.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d
tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4
tag: 3.6.01 date: 05/23/2022 master: e09389ae release: e1d8de42
tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa
tag: 3.7.01 date: 12/01/2022 master: 04821ac3 release: 6cb632b6
59 changes: 47 additions & 12 deletions src/batched/dense/KokkosBatched_Gemm_Decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,42 @@ template <class ArgTransA, class ArgTransB, class ArgBatchSzDim,
int tile_m, int tile_n, int tile_k>
class BatchedDblBufGemm;

//////////////////////////////// tile_m //////////////////////////////////
/// \brief Compile-time value used as the tile_m argument of the double
///        buffering batched GEMM.
/// \tparam ExecutionSpace execution space the value is requested for; this
///         primary template yields the same value for every space.
/// \return 32.
template <typename ExecutionSpace>
constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() {
  constexpr int tile_m_default = 32;
  return tile_m_default;
}
//////////////////////////////// tile_n //////////////////////////////////
/// \brief Compile-time value used as the tile_n argument of the double
///        buffering batched GEMM.
/// \tparam ExecutionSpace execution space the value is requested for; this
///         primary template yields the same value for every space.
/// \return 32.
template <typename ExecutionSpace>
constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_n() {
  constexpr int tile_n_default = 32;
  return tile_n_default;
}
//////////////////////////////// tile_k //////////////////////////////////
/// \brief Compile-time value used as the tile_k argument of the double
///        buffering batched GEMM.
/// \tparam ExecutionSpace execution space the value is requested for; this
///         primary template is the fallback for spaces without a
///         dedicated specialization.
/// \return 8.
template <typename ExecutionSpace>
constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_k() {
  constexpr int tile_k_default = 8;
  return tile_k_default;
}

// Workaround for https://github.com/kokkos/kokkos-kernels/issues/1547: on
// MI100, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails
// without this override. It reduces the register allocations (REG_M and
// REG_N) in the double buffering algorithm by a factor of 2.
#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908)
/// \brief tile_k override for the HIP execution space, compiled only when
///        both KOKKOS_ENABLE_HIP and KOKKOS_ARCH_VEGA908 are defined.
/// \return 16 (the primary template returns 8).
template <>
constexpr KOKKOS_INLINE_FUNCTION int
kk_gemm_dlb_buf_tile_k<Kokkos::Experimental::HIP>() {
  constexpr int tile_k_vega908 = 16;
  return tile_k_vega908;
}
#endif  // KOKKOS_ENABLE_HIP && KOKKOS_ARCH_VEGA908
////////////////////////// alpha_in_fma_thresh ////////////////////////////
/// \brief Threshold that BatchedGemm compares c_m against when deciding
///        whether alpha is applied in the fma.
/// \return 24 when __CUDACC_RDC__ is defined, 64 otherwise.
constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() {
#ifndef __CUDACC_RDC__
  constexpr size_t thresh = 64;
#else
  constexpr size_t thresh = 24;
#endif  // __CUDACC_RDC__
  return thresh;
}

// clang-format off
/// \brief Blocking solve of general matrix multiply on a batch of uniform matrices.
///
Expand Down Expand Up @@ -458,19 +494,19 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha,
// Begin checking conditions for optimal BatchedGemm invocation.
using view_scalar_type = typename CViewType::value_type;
using layout_type = typename CViewType::array_layout;
using exec_space = typename CViewType::execution_space;
constexpr bool is_vector = KokkosBatched::is_vector<view_scalar_type>::value;
constexpr bool on_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space<
typename CViewType::execution_space>();
constexpr bool on_gpu =
KokkosKernels::Impl::kk_is_gpu_exec_space<exec_space>();
constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space<
typename CViewType::execution_space::memory_space>();
typename exec_space::memory_space>();
constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space<
typename CViewType::execution_space::memory_space>();
typename exec_space::memory_space>();

if (handle->enableDebug) {
std::cout << "view_scalar_type:" << typeid(view_scalar_type).name()
<< std::endl
<< "execution_space:"
<< typeid(typename CViewType::execution_space).name() << std::endl
<< "execution_space:" << typeid(exec_space).name() << std::endl
<< std::endl
<< "is_vector:" << is_vector << std::endl
<< "on_gpu:" << on_gpu << std::endl
Expand Down Expand Up @@ -521,12 +557,11 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha,
? (c_m >= 16)
: (c_m >= 24 && c_m <= 32) || c_m >= 40)) {
handle->teamSz = handle->vecLen = 8;
constexpr int tile_m = 32, tile_n = 32, tile_k = 8;
#ifdef __CUDACC_RDC__
constexpr size_t alpha_in_fma_thresh = 24;
#else
constexpr size_t alpha_in_fma_thresh = 64;
#endif // __CUDAACC_RDC__
constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m<exec_space>();
constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n<exec_space>();
constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k<exec_space>();
constexpr size_t alpha_in_fma_thresh =
Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh();

if (c_m % 32 == 0) { // No bounds checking
if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma
Expand Down
8 changes: 4 additions & 4 deletions src/sparse/KokkosSparse_BsrMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,12 +390,12 @@ class BsrMatrix {
typedef BsrMatrix<ScalarType, OrdinalType, host_mirror_space, MemoryTraits>
HostMirror;
//! Type of the graph structure of the sparse matrix.
typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft,
execution_space, memory_traits, size_type>
typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft, device_type,
memory_traits, size_type>
StaticCrsGraphType;
//! Type of the graph structure of the sparse matrix - consistent with Kokkos.
typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft,
execution_space, memory_traits, size_type>
typedef Kokkos::StaticCrsGraph<ordinal_type, Kokkos::LayoutLeft, device_type,
memory_traits, size_type>
staticcrsgraph_type;
//! Type of column indices in the sparse matrix.
typedef typename staticcrsgraph_type::entries_type index_type;
Expand Down
9 changes: 6 additions & 3 deletions src/sparse/KokkosSparse_Utils_cusparse.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,12 @@ inline void cusparse_internal_safe_call(cusparseStatus_t cusparseStatus,

template <typename T>
cudaDataType cuda_data_type_from() {
// Note: compile-time failure is disabled to allow for packages such as
// Ifpack2 to more easily support scalar types that cuSPARSE may not.

// compile-time failure with a nice message if called on an unsupported type
static_assert(!std::is_same<T, T>::value,
"cuSparse TPL does not support scalar type");
// static_assert(!std::is_same<T, T>::value,
// "cuSparse TPL does not support scalar type");
// static_assert(false, ...) is allowed to error even if the code is not
// instantiated, so obfuscate the predicate. Despite this function being
// uncompilable, the compiler may decide that a return statement is missing,
Expand Down Expand Up @@ -151,7 +154,7 @@ inline cudaDataType cuda_data_type_from<Kokkos::complex<float>>() {
}
template <>
inline cudaDataType cuda_data_type_from<Kokkos::complex<double>>() {
return CUDA_C_32F;
return CUDA_C_64F;
}

#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION)
Expand Down
18 changes: 11 additions & 7 deletions src/sparse/KokkosSparse_Utils_mkl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,24 +123,28 @@ template <typename value_type>
class MKLSparseMatrix {
sparse_matrix_t mtx;

static_assert(mkl_is_supported_value_type<value_type>::value,
"Scalar type used in MKLSparseMatrix<value_type> is NOT "
"supported by MKL");

public:
inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {}

// Constructs MKL sparse matrix from KK sparse views (m rows x n cols)
inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols,
MKL_INT *xadj, MKL_INT *adj, value_type *values);
MKL_INT *xadj, MKL_INT *adj, value_type *values) {
throw std::runtime_error(
"Scalar type used in MKLSparseMatrix<value_type> is NOT "
"supported by MKL");
}

// Allows using MKLSparseMatrix directly in MKL calls
inline operator sparse_matrix_t() const { return mtx; }

// Exports MKL sparse matrix contents into KK views
inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols,
MKL_INT *&rows_start, MKL_INT *&columns,
value_type *&values);
value_type *&values) {
throw std::runtime_error(
"Scalar type used in MKLSparseMatrix<value_type> is NOT "
"supported by MKL");
}

inline void destroy() {
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx));
Expand Down Expand Up @@ -256,4 +260,4 @@ inline void MKLSparseMatrix<Kokkos::complex<double>>::export_data(

#endif // KOKKOSKERNELS_ENABLE_TPL_MKL

#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP
57 changes: 55 additions & 2 deletions src/sparse/KokkosSparse_sptrsv_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
#define KOKKOSSPARSE_SPTRSVHANDLE_HPP

#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
#include "cusparse.h"
#include "KokkosSparse_Utils_cusparse.hpp"
#endif

#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \
Expand Down Expand Up @@ -108,6 +108,8 @@ class SPTRSVHandle {
typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t;
typedef typename Kokkos::View<int *, HandlePersistentMemorySpace>
int_row_view_t;
typedef typename Kokkos::View<int64_t *, HandlePersistentMemorySpace>
int64_row_view_t;
// typedef typename row_lno_persistent_work_view_t::HostMirror
// row_lno_persistent_work_host_view_t; //Host view type
typedef typename Kokkos::View<
Expand Down Expand Up @@ -154,6 +156,42 @@ class SPTRSVHandle {
mtx_scalar_view_t;

#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
#if (CUDA_VERSION >= 11030)
/// \brief Bundles the cuSPARSE objects used by the generic SpSV code path
///        (this variant is compiled when CUDA_VERSION >= 11030).
///
/// The constructor creates the library handle and the SpSV descriptor only.
/// matDescr, the dense-vector descriptors and pBuffer are not created here;
/// presumably the symbolic/solve routines fill them in later (TODO confirm
/// against the callers). The destructor releases pBuffer, matDescr,
/// spsvDescr and handle; it does not touch the dense-vector descriptors.
struct cuSparseHandleType {
  // All descriptors start out null so the destructor can tell which of the
  // lazily created ones actually exist.
  cusparseHandle_t handle{nullptr};
  cusparseOperation_t transpose{CUSPARSE_OPERATION_NON_TRANSPOSE};
  cusparseSpMatDescr_t matDescr{nullptr};
  cusparseDnVecDescr_t vecBDescr{nullptr};
  cusparseDnVecDescr_t vecBDescr_dummy{nullptr};
  cusparseDnVecDescr_t vecXDescr{nullptr};
  cusparseDnVecDescr_t vecXDescr_dummy{nullptr};
  cusparseSpSVDescr_t spsvDescr{nullptr};
  void *pBuffer{nullptr};

  /// \param transpose_ true selects CUSPARSE_OPERATION_TRANSPOSE, false
  ///        selects CUSPARSE_OPERATION_NON_TRANSPOSE.
  /// \param is_lower not used by this variant; kept so both variants of the
  ///        struct share one constructor signature.
  cuSparseHandleType(bool transpose_, bool is_lower) {
    (void)is_lower;  // silence the unused-parameter warning

    KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&handle));

    KOKKOS_CUSPARSE_SAFE_CALL(
        cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));

    if (transpose_) {
      transpose = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      transpose = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }

    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_createDescr(&spsvDescr));
  }

  ~cuSparseHandleType() {
    if (pBuffer != nullptr) {
      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(pBuffer));
      pBuffer = nullptr;
    }
    // matDescr is not created by the constructor, so only destroy it when
    // something actually assigned it.
    if (matDescr != nullptr) {
      KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(matDescr));
      matDescr = nullptr;
    }
    KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_destroyDescr(spsvDescr));
    KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(handle));
  }
};
#else // CUDA_VERSION < 11030
struct cuSparseHandleType {
cusparseHandle_t handle;
cusparseOperation_t transpose;
Expand Down Expand Up @@ -202,6 +240,7 @@ class SPTRSVHandle {
cusparseDestroy(handle);
}
};
#endif

typedef cuSparseHandleType SPTRSVcuSparseHandleType;
#endif
Expand Down Expand Up @@ -337,6 +376,7 @@ class SPTRSVHandle {
#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
SPTRSVcuSparseHandleType *cuSPARSEHandle;
int_row_view_t tmp_int_rowmap;
int64_row_view_t tmp_int64_rowmap;
#endif

#ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
Expand Down Expand Up @@ -443,7 +483,8 @@ class SPTRSVHandle {
#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
,
cuSPARSEHandle(nullptr),
tmp_int_rowmap()
tmp_int_rowmap(),
tmp_int64_rowmap()
#endif
#ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV
,
Expand Down Expand Up @@ -851,6 +892,18 @@ class SPTRSVHandle {
}
int_row_view_t get_int_rowmap_view() { return tmp_int_rowmap; }
int *get_int_rowmap_ptr() { return tmp_int_rowmap.data(); }

/// \brief Replaces tmp_int64_rowmap with a freshly allocated view.
/// \param N extent passed to the new view; the allocation is requested
///        with Kokkos::WithoutInitializing and the label
///        "tmp_int64_rowmap".
void allocate_tmp_int64_rowmap(size_type N) {
  const auto alloc_props =
      Kokkos::view_alloc(Kokkos::WithoutInitializing, "tmp_int64_rowmap");
  tmp_int64_rowmap = int64_row_view_t(alloc_props, N);
}
/// \brief Deep-copies \p rowmap into tmp_int64_rowmap, fences, and returns
///        the raw pointer of tmp_int64_rowmap.
/// \tparam RowViewType type of the source row map.
/// \param rowmap source of the copy.
/// \return tmp_int64_rowmap.data() after the copy and fence.
/// \note Presumably tmp_int64_rowmap must already be sized via
///       allocate_tmp_int64_rowmap() -- confirm against the callers.
template <typename RowViewType>
int64_t *get_int64_rowmap_ptr_copy(const RowViewType &rowmap) {
  auto &dst = tmp_int64_rowmap;
  Kokkos::deep_copy(dst, rowmap);
  Kokkos::fence();
  int64_t *raw = dst.data();
  return raw;
}
/// \brief Raw pointer to the storage of tmp_int64_rowmap (no copy is made).
/// \return tmp_int64_rowmap.data().
int64_t *get_int64_rowmap_ptr() {
  int64_t *raw = tmp_int64_rowmap.data();
  return raw;
}
#endif

bool algm_requires_symb_lvlsched() const {
Expand Down
Loading

0 comments on commit ddf1b3d

Please sign in to comment.